Merge remote-tracking branch 'tip/auto-latest'
author		Stephen Rothwell <sfr@canb.auug.org.au>		Tue, 13 Sep 2016 01:53:17 +0000 (11:53 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>		Tue, 13 Sep 2016 01:53:17 +0000 (11:53 +1000)
334 files changed:
Documentation/kernel-parameters.txt
Documentation/ko_KR/memory-barriers.txt [new file with mode: 0644]
Documentation/memory-barriers.txt
Documentation/scheduler/sched-deadline.txt
Documentation/trace/ftrace-design.txt
Documentation/trace/kprobetrace.txt
Documentation/trace/uprobetracer.txt
Documentation/x86/protection-keys.txt
MAINTAINERS
arch/Kconfig
arch/alpha/include/uapi/asm/mman.h
arch/arm/kernel/ftrace.c
arch/arm64/kernel/entry-ftrace.S
arch/arm64/kernel/ftrace.c
arch/blackfin/kernel/ftrace-entry.S
arch/blackfin/kernel/ftrace.c
arch/ia64/include/asm/thread_info.h
arch/microblaze/kernel/ftrace.c
arch/mips/include/uapi/asm/mman.h
arch/mips/kernel/ftrace.c
arch/parisc/include/uapi/asm/mman.h
arch/parisc/kernel/ftrace.c
arch/powerpc/kernel/ftrace.c
arch/powerpc/mm/mmu_context_nohash.c
arch/powerpc/platforms/powermac/smp.c
arch/s390/kernel/ftrace.c
arch/sh/kernel/ftrace.c
arch/sparc/Kconfig
arch/sparc/include/asm/ftrace.h
arch/sparc/kernel/ftrace.c
arch/sparc/kernel/smp_32.c
arch/tile/kernel/ftrace.c
arch/x86/Kconfig
arch/x86/boot/compressed/eboot.c
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/events/amd/uncore.c
arch/x86/events/core.c
arch/x86/events/intel/bts.c
arch/x86/events/intel/core.c
arch/x86/events/intel/cqm.c
arch/x86/events/intel/ds.c
arch/x86/events/intel/lbr.c
arch/x86/events/intel/rapl.c
arch/x86/events/intel/uncore.c
arch/x86/events/intel/uncore.h
arch/x86/events/intel/uncore_snb.c
arch/x86/events/intel/uncore_snbep.c
arch/x86/events/perf_event.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/fpu/xstate.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/hypervisor.h
arch/x86/include/asm/intel-family.h
arch/x86/include/asm/intel-mid.h
arch/x86/include/asm/intel_scu_ipc.h
arch/x86/include/asm/kaslr.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/mmu.h
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/pkeys.h
arch/x86/include/asm/pmem.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/realmode.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/stacktrace.h
arch/x86/include/asm/string_64.h
arch/x86/include/asm/switch_to.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/traps.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/apic_noop.c
arch/x86/kernel/apic/bigsmp_32.c
arch/x86/kernel/apic/msi.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_phys.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/hypervisor.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/microcode/amd.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/e820.c
arch/x86/kernel/fpu/core.c
arch/x86/kernel/fpu/xstate.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/hpet.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/ksysfs.c
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/quirks.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/traps.c
arch/x86/kernel/x86_init.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/lib/memcpy_64.S
arch/x86/mm/amdtopology.c
arch/x86/mm/fault.c
arch/x86/mm/kaslr.c
arch/x86/mm/pkeys.c
arch/x86/mm/tlb.c
arch/x86/oprofile/backtrace.c
arch/x86/pci/pcbios.c
arch/x86/platform/atom/punit_atom_debug.c
arch/x86/platform/intel-mid/device_libs/Makefile
arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c [new file with mode: 0644]
arch/x86/platform/intel-mid/intel-mid.c
arch/x86/platform/intel-mid/pwr.c
arch/x86/xen/enlighten.c
arch/xtensa/include/uapi/asm/mman.h
drivers/acpi/internal.h
drivers/acpi/ioapic.c
drivers/acpi/pci_root.c
drivers/bus/arm-cci.c
drivers/bus/arm-ccn.c
drivers/bus/mips_cdmm.c
drivers/cpuidle/coupled.c
drivers/cpuidle/cpuidle-powernv.c
drivers/cpuidle/cpuidle-pseries.c
drivers/firmware/dcdbas.c
drivers/firmware/efi/efi.c
drivers/firmware/efi/libstub/efi-stub-helper.c
drivers/firmware/efi/libstub/fdt.c
drivers/firmware/efi/libstub/random.c
drivers/hwmon/dell-smm-hwmon.c
drivers/md/raid5.c
drivers/md/raid5.h
drivers/net/ethernet/marvell/mvneta.c
drivers/net/virtio_net.c
drivers/pci/pci-mid.c
drivers/pci/setup-bus.c
drivers/perf/arm_pmu.c
drivers/powercap/intel_rapl.c
fs/proc/base.c
include/linux/acpi.h
include/linux/bitmap.h
include/linux/compiler.h
include/linux/cpu.h
include/linux/cpuhotplug.h
include/linux/efi.h
include/linux/ftrace.h
include/linux/hypervisor.h [new file with mode: 0644]
include/linux/irq.h
include/linux/jump_label.h
include/linux/ktime.h
include/linux/percpu-rwsem.h
include/linux/perf/arm_pmu.h
include/linux/perf_event.h
include/linux/pkeys.h
include/linux/rcu_sync.h
include/linux/relay.h
include/linux/sched.h
include/linux/slab.h
include/linux/smp.h
include/linux/syscalls.h
include/linux/time64.h
include/trace/events/cpuhp.h
include/uapi/asm-generic/mman-common.h
include/uapi/asm-generic/unistd.h
include/xen/interface/sched.h
kernel/cgroup.c
kernel/cpu.c
kernel/events/core.c
kernel/events/ring_buffer.c
kernel/events/uprobes.c
kernel/fork.c
kernel/futex.c
kernel/hung_task.c
kernel/irq/chip.c
kernel/irq/generic-chip.c
kernel/irq/irqdomain.c
kernel/irq/manage.c
kernel/locking/percpu-rwsem.c
kernel/locking/qspinlock_paravirt.h
kernel/locking/qspinlock_stat.h
kernel/locking/rwsem-xadd.c
kernel/rcu/sync.c
kernel/relay.c
kernel/sched/core.c
kernel/sched/cpudeadline.c
kernel/sched/cpudeadline.h
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/idle_task.c
kernel/sched/sched.h
kernel/sched/stats.h
kernel/smp.c
kernel/softirq.c
kernel/time/clocksource.c
kernel/time/hrtimer.c
kernel/time/time.c
kernel/time/timekeeping_debug.c
kernel/trace/Kconfig
kernel/trace/trace.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_kprobe.c
kernel/trace/trace_probe.c
kernel/trace/trace_probe.h
kernel/trace/trace_uprobe.c
kernel/up.c
lib/dma-debug.c
mm/mprotect.c
mm/page-writeback.c
mm/slab.c
mm/slub.c
tools/include/linux/coresight-pmu.h [new file with mode: 0644]
tools/include/linux/time64.h [new file with mode: 0644]
tools/lguest/lguest.c
tools/lib/api/fs/fs.c
tools/lib/api/fs/fs.h
tools/perf/Documentation/perf-config.txt
tools/perf/Documentation/perf-probe.txt
tools/perf/Documentation/perfconfig.example
tools/perf/MANIFEST
tools/perf/Makefile.perf
tools/perf/arch/arm/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/arm64/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/powerpc/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/powerpc/util/sym-handling.c
tools/perf/arch/s390/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/sh/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/sparc/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/x86/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/xtensa/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/bench/futex-requeue.c
tools/perf/bench/futex-wake-parallel.c
tools/perf/bench/futex-wake.c
tools/perf/bench/mem-functions.c
tools/perf/bench/numa.c
tools/perf/bench/sched-messaging.c
tools/perf/bench/sched-pipe.c
tools/perf/builtin-annotate.c
tools/perf/builtin-diff.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-probe.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/perf-sys.h
tools/perf/perf.h
tools/perf/tests/backward-ring-buffer.c
tools/perf/tests/bpf.c
tools/perf/tests/code-reading.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/ui/browsers/annotate.c
tools/perf/ui/browsers/hists.c
tools/perf/ui/browsers/map.c
tools/perf/ui/gtk/hists.c
tools/perf/ui/hist.c
tools/perf/ui/stdio/hist.c
tools/perf/util/Build
tools/perf/util/annotate.c
tools/perf/util/annotate.h
tools/perf/util/block-range.c [new file with mode: 0644]
tools/perf/util/block-range.h [new file with mode: 0644]
tools/perf/util/bpf-loader.c
tools/perf/util/debug.c
tools/perf/util/dwarf-aux.c
tools/perf/util/dwarf-aux.h
tools/perf/util/dwarf-regs.c [new file with mode: 0644]
tools/perf/util/event.c
tools/perf/util/evlist.c
tools/perf/util/evsel_fprintf.c
tools/perf/util/header.c
tools/perf/util/hist.h
tools/perf/util/include/dwarf-regs.h
tools/perf/util/intel-bts.c
tools/perf/util/intel-pt.c
tools/perf/util/lzma.c
tools/perf/util/machine.c
tools/perf/util/machine.h
tools/perf/util/map.c
tools/perf/util/map.h
tools/perf/util/pmu.c
tools/perf/util/probe-event.c
tools/perf/util/probe-event.h
tools/perf/util/probe-file.c
tools/perf/util/probe-file.h
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/svghelper.c
tools/perf/util/symbol-elf.c
tools/perf/util/symbol-minimal.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/util.c
tools/perf/util/util.h
tools/testing/radix-tree/linux/cpu.h
tools/testing/selftests/x86/Makefile
tools/testing/selftests/x86/pkey-helpers.h [new file with mode: 0644]
tools/testing/selftests/x86/protection_keys.c [new file with mode: 0644]
tools/testing/selftests/x86/ptrace_syscall.c

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 73191f4fe5b6a41b9bee5ff08b336feea1f36817..b28015cad949571408b4e544dc1d1588be9d4a8f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1652,6 +1652,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        initrd=         [BOOT] Specify the location of the initial ramdisk
 
+       init_pkru=      [x86] Specify the default memory protection keys rights
+                       register contents for all processes.  0x55555554 by
+                       default (disallow access to all but pkey 0).  Can
+                       override in debugfs after boot.
+
        inport.irq=     [HW] Inport (ATI XL and Microsoft) busmouse driver
                        Format: <irq>
 
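For reference, a minimal user-space sketch of how the rights governed by
init_pkru are typically exercised, assuming thin wrappers around the new
pkey_alloc()/pkey_mprotect()/pkey_free() system calls added by this series
(the protection_keys selftest defines such wrappers); error handling omitted:

	#include <sys/mman.h>

	void *addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);	/* new key: writes denied */

	pkey_mprotect(addr, 4096, PROT_READ | PROT_WRITE, pkey);
	/* reads of addr succeed; writes fault until the key's rights are relaxed */
	pkey_free(pkey);
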
diff --git a/Documentation/ko_KR/memory-barriers.txt b/Documentation/ko_KR/memory-barriers.txt
new file mode 100644
index 0000000..34d3d38
--- /dev/null
+++ b/Documentation/ko_KR/memory-barriers.txt
@@ -0,0 +1,3135 @@
+NOTE:
+This is a version of Documentation/memory-barriers.txt translated into Korean.
+This document is maintained by SeongJae Park <sj38.park@gmail.com>.
+If you find any difference between this document and the original file or
+a problem with the translation, please contact the maintainer of this file.
+
+Please also note that the purpose of this file is to make the document
+easier to read for non-English (read: Korean) speakers; it is not
+intended as a fork.  So if you have any comments or updates for this
+file, please update the original English file first.  The English
+version is definitive, and readers should look there if they are in
+any doubt.
+
+===================================
+이 문서는
+Documentation/memory-barriers.txt
+의 한글 번역입니다.
+
+역자: 박성재 <sj38.park@gmail.com>
+===================================
+
+
+                        =========================
+                        리눅스 커널 메모리 배리어
+                        =========================
+
+저자: David Howells <dhowells@redhat.com>
+      Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+      Will Deacon <will.deacon@arm.com>
+      Peter Zijlstra <peterz@infradead.org>
+
+========
+면책조항
+========
+
+이 문서는 명세서가 아닙니다; 이 문서는 완벽하지 않은데, 간결성을 위해 의도된
+부분도 있고, 의도하진 않았지만 사람에 의해 쓰였다보니 불완전한 부분도 있습니다.
+이 문서는 리눅스에서 제공하는 다양한 메모리 배리어들을 사용하기 위한
+안내서입니다만, 뭔가 이상하다 싶으면 (그런게 많을 겁니다) 질문을 부탁드립니다.
+
+다시 말하지만, 이 문서는 리눅스가 하드웨어에 기대하는 사항에 대한 명세서가
+아닙니다.
+
+이 문서의 목적은 두가지입니다:
+
+ (1) 어떤 특정 배리어에 대해 기대할 수 있는 최소한의 기능을 명세하기 위해서,
+     그리고
+
+ (2) 사용 가능한 배리어들에 대해 어떻게 사용해야 하는지에 대한 안내를 제공하기
+     위해서.
+
+어떤 아키텍쳐는 특정한 배리어들에 대해서는 여기서 이야기하는 최소한의
+요구사항들보다 많은 기능을 제공할 수도 있습니다만, 여기서 이야기하는
+요구사항들을 충족하지 않는 아키텍쳐가 있다면 그 아키텍쳐가 잘못된 것이란 점을
+알아두시기 바랍니다.
+
+또한, 특정 아키텍쳐에서 일부 배리어는 해당 아키텍쳐의 특수한 동작 방식으로 인해
+해당 배리어의 명시적 사용이 불필요해서 no-op 이 될수도 있음을 알아두시기
+바랍니다.
+
+역자: 본 번역 역시 완벽하지 않은데, 이 역시 부분적으로는 의도된 것이기도
+합니다.  여타 기술 문서들이 그렇듯 완벽한 이해를 위해서는 번역문과 원문을 함께
+읽으시되 번역문을 하나의 가이드로 활용하시길 추천드리며, 발견되는 오역 등에
+대해서는 언제든 의견을 부탁드립니다.  과한 번역으로 인한 오해를 최소화하기 위해
+애매한 부분이 있을 경우에는 어색함이 있더라도 원래의 용어를 차용합니다.
+
+
+=====
+목차:
+=====
+
+ (*) 추상 메모리 액세스 모델.
+
+     - 디바이스 오퍼레이션.
+     - 보장사항.
+
+ (*) 메모리 배리어란 무엇인가?
+
+     - 메모리 배리어의 종류.
+     - 메모리 배리어에 대해 가정해선 안될 것.
+     - 데이터 의존성 배리어.
+     - 컨트롤 의존성.
+     - SMP 배리어 짝맞추기.
+     - 메모리 배리어 시퀀스의 예.
+     - 읽기 메모리 배리어 vs 로드 예측.
+     - 이행성
+
+ (*) 명시적 커널 배리어.
+
+     - 컴파일러 배리어.
+     - CPU 메모리 배리어.
+     - MMIO 쓰기 배리어.
+
+ (*) 암묵적 커널 메모리 배리어.
+
+     - 락 Acquisition 함수.
+     - 인터럽트 비활성화 함수.
+     - 슬립과 웨이크업 함수.
+     - 그외의 함수들.
+
+ (*) CPU 간 ACQUIRING 배리어의 효과.
+
+     - Acquire vs 메모리 액세스.
+     - Acquire vs I/O 액세스.
+
+ (*) 메모리 배리어가 필요한 곳
+
+     - 프로세서간 상호 작용.
+     - 어토믹 오퍼레이션.
+     - 디바이스 액세스.
+     - 인터럽트.
+
+ (*) 커널 I/O 배리어의 효과.
+
+ (*) 가정되는 가장 완화된 실행 순서 모델.
+
+ (*) CPU 캐시의 영향.
+
+     - 캐시 일관성.
+     - 캐시 일관성 vs DMA.
+     - 캐시 일관성 vs MMIO.
+
+ (*) CPU 들이 저지르는 일들.
+
+     - 그리고, Alpha 가 있다.
+     - 가상 머신 게스트.
+
+ (*) 사용 예.
+
+     - 순환식 버퍼.
+
+ (*) 참고 문헌.
+
+
+=======================
+추상 메모리 액세스 모델
+=======================
+
+다음과 같이 추상화된 시스템 모델을 생각해 봅시다:
+
+                           :                :
+                           :                :
+                           :                :
+               +-------+   :   +--------+   :   +-------+
+               |       |   :   |        |   :   |       |
+               |       |   :   |        |   :   |       |
+               | CPU 1 |<----->| Memory |<----->| CPU 2 |
+               |       |   :   |        |   :   |       |
+               |       |   :   |        |   :   |       |
+               +-------+   :   +--------+   :   +-------+
+                   ^       :       ^        :       ^
+                   |       :       |        :       |
+                   |       :       |        :       |
+                   |       :       v        :       |
+                   |       :   +--------+   :       |
+                   |       :   |        |   :       |
+                   |       :   |        |   :       |
+                   +---------->| Device |<----------+
+                           :   |        |   :
+                           :   |        |   :
+                           :   +--------+   :
+                           :                :
+
+프로그램은 여러 메모리 액세스 오퍼레이션을 발생시키고, 각각의 CPU 는 그런
+프로그램들을 실행합니다.  추상화된 CPU 모델에서 메모리 오퍼레이션들의 순서는
+매우 완화되어 있고, CPU 는 프로그램이 인과관계를 어기지 않는 상태로 관리된다고
+보일 수만 있다면 메모리 오퍼레이션을 자신이 원하는 어떤 순서대로든 재배치해
+동작시킬 수 있습니다.  비슷하게, 컴파일러 또한 프로그램의 정상적 동작을 해치지
+않는 한도 내에서는 어떤 순서로든 자신이 원하는 대로 인스트럭션을 재배치 할 수
+있습니다.
+
+따라서 위의 다이어그램에서 한 CPU가 동작시키는 메모리 오퍼레이션이 만들어내는
+변화는 해당 오퍼레이션이 CPU 와 시스템의 다른 부분들 사이의 인터페이스(점선)를
+지나가면서 시스템의 나머지 부분들에 인지됩니다.
+
+
+예를 들어, 다음의 일련의 이벤트들을 생각해 봅시다:
+
+       CPU 1           CPU 2
+       =============== ===============
+       { A == 1; B == 2 }
+       A = 3;          x = B;
+       B = 4;          y = A;
+
+다이어그램의 가운데에 위치한 메모리 시스템에 보여지게 되는 액세스들은 다음의 총
+24개의 조합으로 재구성될 수 있습니다:
+
+       STORE A=3,      STORE B=4,      y=LOAD A->3,    x=LOAD B->4
+       STORE A=3,      STORE B=4,      x=LOAD B->4,    y=LOAD A->3
+       STORE A=3,      y=LOAD A->3,    STORE B=4,      x=LOAD B->4
+       STORE A=3,      y=LOAD A->3,    x=LOAD B->2,    STORE B=4
+       STORE A=3,      x=LOAD B->2,    STORE B=4,      y=LOAD A->3
+       STORE A=3,      x=LOAD B->2,    y=LOAD A->3,    STORE B=4
+       STORE B=4,      STORE A=3,      y=LOAD A->3,    x=LOAD B->4
+       STORE B=4, ...
+       ...
+
+따라서 다음의 네가지 조합의 값들이 나올 수 있습니다:
+
+       x == 2, y == 1
+       x == 2, y == 3
+       x == 4, y == 1
+       x == 4, y == 3
+
+
+한발 더 나아가서, 한 CPU 가 메모리 시스템에 반영한 스토어 오퍼레이션들의 결과는
+다른 CPU 에서의 로드 오퍼레이션을 통해 인지되는데, 이 때 스토어가 반영된 순서와
+다른 순서로 인지될 수도 있습니다.
+
+
+예로, 아래의 일련의 이벤트들을 생각해 봅시다:
+
+       CPU 1           CPU 2
+       =============== ===============
+       { A == 1, B == 2, C == 3, P == &A, Q == &C }
+       B = 4;          Q = P;
+       P = &B          D = *Q;
+
+D 로 읽혀지는 값은 CPU 2 에서 P 로부터 읽혀진 주소값에 의존적이기 때문에 여기엔
+분명한 데이터 의존성이 있습니다.  하지만 이 이벤트들의 실행 결과로는 아래의
+결과들이 모두 나타날 수 있습니다:
+
+       (Q == &A) and (D == 1)
+       (Q == &B) and (D == 2)
+       (Q == &B) and (D == 4)
+
+CPU 2 는 *Q 의 로드를 요청하기 전에 P 를 Q 에 넣기 때문에 D 에 C 를 집어넣는
+일은 없음을 알아두세요.
+
+
+디바이스 오퍼레이션
+-------------------
+
+일부 디바이스는 자신의 컨트롤 인터페이스를 메모리의 특정 영역으로 매핑해서
+제공하는데(Memory mapped I/O), 해당 컨트롤 레지스터에 접근하는 순서는 매우
+중요합니다.  예를 들어, 어드레스 포트 레지스터 (A) 와 데이터 포트 레지스터 (D)
+를 통해 접근되는 내부 레지스터 집합을 갖는 이더넷 카드를 생각해 봅시다.  내부의
+5번 레지스터를 읽기 위해 다음의 코드가 사용될 수 있습니다:
+
+       *A = 5;
+       x = *D;
+
+하지만, 이건 다음의 두 조합 중 하나로 만들어질 수 있습니다:
+
+       STORE *A = 5, x = LOAD *D
+       x = LOAD *D, STORE *A = 5
+
+두번째 조합은 데이터를 읽어온 _후에_ 주소를 설정하므로, 오동작을 일으킬 겁니다.
+
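+As a rough illustration only (the I/O accessor discussion in the original
+English document is the authority here), the usual cure when such a control
+window is mapped with relaxed attributes is a mandatory barrier between the
+two accesses:
+
+       *A = 5;
+       mb();           /* order the address store ahead of ... */
+       x = *D;         /* ... the data register read            */
+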
+
+보장사항
+--------
+
+CPU 에게 기대할 수 있는 최소한의 보장사항 몇가지가 있습니다:
+
+ (*) 어떤 CPU 든, 의존성이 존재하는 메모리 액세스들은 해당 CPU 자신에게
+     있어서는 순서대로 메모리 시스템에 수행 요청됩니다. 즉, 다음에 대해서:
+
+       Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
+
+     CPU 는 다음과 같은 메모리 오퍼레이션 시퀀스를 수행 요청합니다:
+
+       Q = LOAD P, D = LOAD *Q
+
+     그리고 그 시퀀스 내에서의 순서는 항상 지켜집니다.  대부분의 시스템에서
+     smp_read_barrier_depends() 는 아무일도 안하지만 DEC Alpha 에서는
+     명시적으로 사용되어야 합니다.  보통의 경우에는 smp_read_barrier_depends()
+     를 직접 사용하는 대신 rcu_dereference() 같은 것들을 사용해야 함을
+     알아두세요.
+
+ (*) 특정 CPU 내에서 겹치는 영역의 메모리에 행해지는 로드와 스토어 들은 해당
+     CPU 안에서는 순서가 바뀌지 않은 것으로 보여집니다.  즉, 다음에 대해서:
+
+       a = READ_ONCE(*X); WRITE_ONCE(*X, b);
+
+     CPU 는 다음의 메모리 오퍼레이션 시퀀스만을 메모리에 요청할 겁니다:
+
+       a = LOAD *X, STORE *X = b
+
+     그리고 다음에 대해서는:
+
+       WRITE_ONCE(*X, c); d = READ_ONCE(*X);
+
+     CPU 는 다음의 수행 요청만을 만들어 냅니다:
+
+       STORE *X = c, d = LOAD *X
+
+     (로드 오퍼레이션과 스토어 오퍼레이션이 겹치는 메모리 영역에 대해
+     수행된다면 해당 오퍼레이션들은 겹친다고 표현됩니다).
+
+그리고 _반드시_ 또는 _절대로_ 가정하거나 가정하지 말아야 하는 것들이 있습니다:
+
+ (*) 컴파일러가 READ_ONCE() 나 WRITE_ONCE() 로 보호되지 않은 메모리 액세스를
+     당신이 원하는 대로 할 것이라는 가정은 _절대로_ 해선 안됩니다.  그것들이
+     없다면, 컴파일러는 컴파일러 배리어 섹션에서 다루게 될, 모든 "창의적인"
+     변경들을 만들어낼 권한을 갖게 됩니다.
+
+ (*) 개별적인 로드와 스토어들이 주어진 순서대로 요청될 것이라는 가정은 _절대로_
+     하지 말아야 합니다.  이 말은 곧:
+
+       X = *A; Y = *B; *D = Z;
+
+     는 다음의 것들 중 어느 것으로든 만들어질 수 있다는 의미입니다:
+
+       X = LOAD *A,  Y = LOAD *B,  STORE *D = Z
+       X = LOAD *A,  STORE *D = Z, Y = LOAD *B
+       Y = LOAD *B,  X = LOAD *A,  STORE *D = Z
+       Y = LOAD *B,  STORE *D = Z, X = LOAD *A
+       STORE *D = Z, X = LOAD *A,  Y = LOAD *B
+       STORE *D = Z, Y = LOAD *B,  X = LOAD *A
+
+ (*) 겹치는 메모리 액세스들은 합쳐지거나 버려질 수 있음을 _반드시_ 가정해야
+     합니다.  다음의 코드는:
+
+       X = *A; Y = *(A + 4);
+
+     다음의 것들 중 뭐든 될 수 있습니다:
+
+       X = LOAD *A; Y = LOAD *(A + 4);
+       Y = LOAD *(A + 4); X = LOAD *A;
+       {X, Y} = LOAD {*A, *(A + 4) };
+
+     그리고:
+
+       *A = X; *(A + 4) = Y;
+
+     는 다음 중 뭐든 될 수 있습니다:
+
+       STORE *A = X; STORE *(A + 4) = Y;
+       STORE *(A + 4) = Y; STORE *A = X;
+       STORE {*A, *(A + 4) } = {X, Y};
+
+그리고 보장사항에 반대되는 것들(anti-guarantees)이 있습니다:
+
+ (*) 이 보장사항들은 bitfield 에는 적용되지 않는데, 컴파일러들은 bitfield 를
+     수정하는 코드를 생성할 때 원자성 없는(non-atomic) 읽고-수정하고-쓰는
+     인스트럭션들의 조합을 만드는 경우가 많기 때문입니다.  병렬 알고리즘의
+     동기화에 bitfield 를 사용하려 하지 마십시오.
+
+ (*) bitfield 들이 여러 락으로 보호되는 경우라 하더라도, 하나의 bitfield 의
+     모든 필드들은 하나의 락으로 보호되어야 합니다.  만약 한 bitfield 의 두
+     필드가 서로 다른 락으로 보호된다면, 컴파일러의 원자성 없는
+     읽고-수정하고-쓰는 인스트럭션 조합은 한 필드에의 업데이트가 근처의
+     필드에도 영향을 끼치게 할 수 있습니다.
+
+ (*) 이 보장사항들은 적절하게 정렬되고 크기가 잡힌 스칼라 변수들에 대해서만
+     적용됩니다.  "적절하게 크기가 잡힌" 이라함은 현재로써는 "char", "short",
+     "int" 그리고 "long" 과 같은 크기의 변수들을 의미합니다.  "적절하게 정렬된"
+     은 자연스런 정렬을 의미하는데, 따라서 "char" 에 대해서는 아무 제약이 없고,
+     "short" 에 대해서는 2바이트 정렬을, "int" 에는 4바이트 정렬을, 그리고
+     "long" 에 대해서는 32-bit 시스템인지 64-bit 시스템인지에 따라 4바이트 또는
+     8바이트 정렬을 의미합니다.  이 보장사항들은 C11 표준에서 소개되었으므로,
+     C11 전의 오래된 컴파일러(예를 들어, gcc 4.6) 를 사용할 때엔 주의하시기
+     바랍니다.  표준에 이 보장사항들은 "memory location" 을 정의하는 3.14
+     섹션에 다음과 같이 설명되어 있습니다:
+     (역자: 인용문이므로 번역하지 않습니다)
+
+       memory location
+               either an object of scalar type, or a maximal sequence
+               of adjacent bit-fields all having nonzero width
+
+               NOTE 1: Two threads of execution can update and access
+               separate memory locations without interfering with
+               each other.
+
+               NOTE 2: A bit-field and an adjacent non-bit-field member
+               are in separate memory locations. The same applies
+               to two bit-fields, if one is declared inside a nested
+               structure declaration and the other is not, or if the two
+               are separated by a zero-length bit-field declaration,
+               or if they are separated by a non-bit-field member
+               declaration. It is not safe to concurrently update two
+               bit-fields in the same structure if all members declared
+               between them are also bit-fields, no matter what the
+               sizes of those intervening bit-fields happen to be.
+
+
+=========================
+메모리 배리어란 무엇인가?
+=========================
+
+앞에서 봤듯이, 상호간 의존성이 없는 메모리 오퍼레이션들은 실제로는 무작위적
+순서로 수행될 수 있으며, 이는 CPU 와 CPU 간의 상호작용이나 I/O 에 문제가 될 수
+있습니다.  따라서 컴파일러와 CPU 가 순서를 바꾸는데 제약을 걸 수 있도록 개입할
+수 있는 어떤 방법이 필요합니다.
+
+메모리 배리어는 그런 개입 수단입니다.  메모리 배리어는 배리어를 사이에 둔 앞과
+뒤 양측의 메모리 오퍼레이션들 간에 부분적 순서가 존재하도록 하는 효과를 줍니다.
+
+시스템의 CPU 들과 여러 디바이스들은 성능을 올리기 위해 명령어 재배치, 실행
+유예, 메모리 오퍼레이션들의 조합, 예측적 로드(speculative load), 브랜치
+예측(speculative branch prediction), 다양한 종류의 캐싱(caching) 등의 다양한
+트릭을 사용할 수 있기 때문에 이런 강제력은 중요합니다.  메모리 배리어들은 이런
+트릭들을 무효로 하거나 억제하는 목적으로 사용되어져서 코드가 여러 CPU 와
+디바이스들 간의 상호작용을 정상적으로 제어할 수 있게 해줍니다.
+
+
+메모리 배리어의 종류
+--------------------
+
+메모리 배리어는 네개의 기본 타입으로 분류됩니다:
+
+ (1) 쓰기 (또는 스토어) 메모리 배리어.
+
+     쓰기 메모리 배리어는 시스템의 다른 컴포넌트들에 해당 배리어보다 앞서
+     명시된 모든 STORE 오퍼레이션들이 해당 배리어 뒤에 명시된 모든 STORE
+     오퍼레이션들보다 먼저 수행된 것으로 보일 것을 보장합니다.
+
+     쓰기 배리어는 스토어 오퍼레이션들에 대한 부분적 순서 세우기입니다; 로드
+     오퍼레이션들에 대해서는 어떤 영향도 끼치지 않습니다.
+
+     CPU 는 시간의 흐름에 따라 메모리 시스템에 일련의 스토어 오퍼레이션들을
+     하나씩 요청해 집어넣습니다.  쓰기 배리어 앞의 모든 스토어 오퍼레이션들은
+     쓰기 배리어 뒤의 모든 스토어 오퍼레이션들보다 _앞서_ 수행될 겁니다.
+
+     [!] 쓰기 배리어들은 읽기 또는 데이터 의존성 배리어와 함께 짝을 맞춰
+     사용되어야만 함을 알아두세요; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요.
+
+
+ (2) 데이터 의존성 배리어.
+
+     데이터 의존성 배리어는 읽기 배리어의 보다 완화된 형태입니다.  두개의 로드
+     오퍼레이션이 있고 두번째 것이 첫번째 것의 결과에 의존하고 있을 때(예:
+     두번째 로드가 참조할 주소를 첫번째 로드가 읽는 경우), 두번째 로드가 읽어올
+     데이터는 첫번째 로드에 의해 그 주소가 얻어지기 전에 업데이트 되어 있음을
+     보장하기 위해서 데이터 의존성 배리어가 필요할 수 있습니다.
+
+     데이터 의존성 배리어는 상호 의존적인 로드 오퍼레이션들 사이의 부분적 순서
+     세우기입니다; 스토어 오퍼레이션들이나 독립적인 로드들, 또는 중복되는
+     로드들에 대해서는 어떤 영향도 끼치지 않습니다.
+
+     (1) 에서 언급했듯이, 시스템의 CPU 들은 메모리 시스템에 일련의 스토어
+     오퍼레이션들을 던져 넣고 있으며, 거기에 관심이 있는 다른 CPU 는 그
+     오퍼레이션들을 메모리 시스템이 실행한 결과를 인지할 수 있습니다.  이처럼
+     다른 CPU 의 스토어 오퍼레이션의 결과에 관심을 두고 있는 CPU 가 수행 요청한
+     데이터 의존성 배리어는, 배리어 앞의 어떤 로드 오퍼레이션이 다른 CPU 에서
+     던져 넣은 스토어 오퍼레이션과 같은 영역을 향했다면, 그런 스토어
+     오퍼레이션들이 만들어내는 결과가 데이터 의존성 배리어 뒤의 로드
+     오퍼레이션들에게는 보일 것을 보장합니다.
+
+     이 순서 세우기 제약에 대한 그림을 보기 위해선 "메모리 배리어 시퀀스의 예"
+     서브섹션을 참고하시기 바랍니다.
+
+     [!] 첫번째 로드는 반드시 _데이터_ 의존성을 가져야지 컨트롤 의존성을 가져야
+     하는게 아님을 알아두십시오.  만약 두번째 로드를 위한 주소가 첫번째 로드에
+     의존적이지만 그 의존성은 조건적이지 그 주소 자체를 가져오는게 아니라면,
+     그것은 _컨트롤_ 의존성이고, 이 경우에는 읽기 배리어나 그보다 강력한
+     무언가가 필요합니다.  더 자세한 내용을 위해서는 "컨트롤 의존성" 서브섹션을
+     참고하시기 바랍니다.
+
+     [!] 데이터 의존성 배리어는 보통 쓰기 배리어들과 함께 짝을 맞춰 사용되어야
+     합니다; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요.
+
+
+ (3) 읽기 (또는 로드) 메모리 배리어.
+
+     읽기 배리어는 데이터 의존성 배리어 기능의 보장사항에 더해서 배리어보다
+     앞서 명시된 모든 LOAD 오퍼레이션들이 배리어 뒤에 명시되는 모든 LOAD
+     오퍼레이션들보다 먼저 행해진 것으로 시스템의 다른 컴포넌트들에 보여질 것을
+     보장합니다.
+
+     읽기 배리어는 로드 오퍼레이션에 행해지는 부분적 순서 세우기입니다; 스토어
+     오퍼레이션에 대해서는 어떤 영향도 끼치지 않습니다.
+
+     읽기 메모리 배리어는 데이터 의존성 배리어를 내장하므로 데이터 의존성
+     배리어를 대신할 수 있습니다.
+
+     [!] 읽기 배리어는 일반적으로 쓰기 배리어들과 함께 짝을 맞춰 사용되어야
+     합니다; "SMP 배리어 짝맞추기" 서브섹션을 참고하세요.
+
+
+ (4) 범용 메모리 배리어.
+
+     범용(general) 메모리 배리어는 배리어보다 앞서 명시된 모든 LOAD 와 STORE
+     오퍼레이션들이 배리어 뒤에 명시된 모든 LOAD 와 STORE 오퍼레이션들보다
+     먼저 수행된 것으로 시스템의 나머지 컴포넌트들에 보이게 됨을 보장합니다.
+
+     범용 메모리 배리어는 로드와 스토어 모두에 대한 부분적 순서 세우기입니다.
+
+     범용 메모리 배리어는 읽기 메모리 배리어, 쓰기 메모리 배리어 모두를
+     내장하므로, 두 배리어를 모두 대신할 수 있습니다.
+
+
+그리고 두개의 명시적이지 않은 타입이 있습니다:
+
+ (5) ACQUIRE 오퍼레이션.
+
+     이 타입의 오퍼레이션은 단방향의 투과성 배리어처럼 동작합니다.  ACQUIRE
+     오퍼레이션 뒤의 모든 메모리 오퍼레이션들이 ACQUIRE 오퍼레이션 후에
+     일어난 것으로 시스템의 나머지 컴포넌트들에 보이게 될 것이 보장됩니다.
+     LOCK 오퍼레이션과 smp_load_acquire(), smp_cond_acquire() 오퍼레이션도
+     ACQUIRE 오퍼레이션에 포함됩니다.  smp_cond_acquire() 오퍼레이션은 컨트롤
+     의존성과 smp_rmb() 를 사용해서 ACQUIRE 의 의미적 요구사항(semantic)을
+     충족시킵니다.
+
+     ACQUIRE 오퍼레이션 앞의 메모리 오퍼레이션들은 ACQUIRE 오퍼레이션 완료 후에
+     수행된 것처럼 보일 수 있습니다.
+
+     ACQUIRE 오퍼레이션은 거의 항상 RELEASE 오퍼레이션과 짝을 지어 사용되어야
+     합니다.
+
+
+ (6) RELEASE 오퍼레이션.
+
+     이 타입의 오퍼레이션들도 단방향 투과성 배리어처럼 동작합니다.  RELEASE
+     오퍼레이션 앞의 모든 메모리 오퍼레이션들은 RELEASE 오퍼레이션 전에 완료된
+     것으로 시스템의 다른 컴포넌트들에 보여질 것이 보장됩니다.  UNLOCK 류의
+     오퍼레이션들과 smp_store_release() 오퍼레이션도 RELEASE 오퍼레이션의
+     일종입니다.
+
+     RELEASE 오퍼레이션 뒤의 메모리 오퍼레이션들은 RELEASE 오퍼레이션이
+     완료되기 전에 행해진 것처럼 보일 수 있습니다.
+
+     ACQUIRE 와 RELEASE 오퍼레이션의 사용은 일반적으로 다른 메모리 배리어의
+     필요성을 없앱니다 (하지만 "MMIO 쓰기 배리어" 서브섹션에서 설명되는 예외를
+     알아두세요).  또한, RELEASE+ACQUIRE 조합은 범용 메모리 배리어처럼 동작할
+     것을 보장하지 -않습니다-.  하지만, 어떤 변수에 대한 RELEASE 오퍼레이션을
+     앞서는 메모리 액세스들의 수행 결과는 이 RELEASE 오퍼레이션을 뒤이어 같은
+     변수에 대해 수행된 ACQUIRE 오퍼레이션을 뒤따르는 메모리 액세스에는 보여질
+     것이 보장됩니다.  다르게 말하자면, 주어진 변수의 크리티컬 섹션에서는, 해당
+     변수에 대한 앞의 크리티컬 섹션에서의 모든 액세스들이 완료되었을 것을
+     보장합니다.
+
+     즉, ACQUIRE 는 최소한의 "취득" 동작처럼, 그리고 RELEASE 는 최소한의 "공개"
+     처럼 동작한다는 의미입니다.
+
+atomic_ops.txt 에서 설명되는 어토믹 오퍼레이션들 중에는 완전히 순서잡힌 것들과
+(배리어를 사용하지 않는) 완화된 순서의 것들 외에 ACQUIRE 와 RELEASE 부류의
+것들도 존재합니다.  로드와 스토어를 모두 수행하는 조합된 어토믹 오퍼레이션에서,
+ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 는 해당
+오퍼레이션의 스토어 부분에만 적용됩니다.
+
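+A hedged sketch of what that means in practice, using the _acquire/_release
+atomic variants described in atomic_ops.txt (names assumed to be available
+in this tree):
+
+       atomic_t v = ATOMIC_INIT(0);
+       int old;
+
+       old = atomic_cmpxchg_acquire(&v, 0, 1); /* ACQUIRE attaches to the load half  */
+       /* ... work that must not be hoisted above the load of v ... */
+       atomic_cmpxchg_release(&v, 1, 0);       /* RELEASE attaches to the store half */
+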
+메모리 배리어들은 두 CPU 간, 또는 CPU 와 디바이스 간에 상호작용의 가능성이 있을
+때에만 필요합니다.  만약 어떤 코드에 그런 상호작용이 없을 것이 보장된다면, 해당
+코드에서는 메모리 배리어를 사용할 필요가 없습니다.
+
+
+이것들은 _최소한의_ 보장사항들임을 알아두세요.  다른 아키텍쳐에서는 더 강력한
+보장사항을 제공할 수도 있습니다만, 그런 보장사항은 아키텍쳐 종속적 코드 이외의
+부분에서는 신뢰되지 _않을_ 겁니다.
+
+
+메모리 배리어에 대해 가정해선 안될 것
+-------------------------------------
+
+리눅스 커널 메모리 배리어들이 보장하지 않는 것들이 있습니다:
+
+ (*) 메모리 배리어 앞에서 명시된 어떤 메모리 액세스도 메모리 배리어 명령의 수행
+     완료 시점까지 _완료_ 될 것이란 보장은 없습니다; 배리어가 하는 일은 CPU 의
+     액세스 큐에 특정 타입의 액세스들은 넘을 수 없는 선을 긋는 것으로 생각될 수
+     있습니다.
+
+ (*) 한 CPU 에서 메모리 배리어를 수행하는게 시스템의 다른 CPU 나 하드웨어에
+     어떤 직접적인 영향을 끼친다는 보장은 존재하지 않습니다.  배리어 수행이
+     만드는 간접적 영향은 두번째 CPU 가 첫번째 CPU 의 액세스들의 결과를
+     바라보는 순서가 됩니다만, 다음 항목을 보세요:
+
+ (*) 첫번째 CPU 가 두번째 CPU 의 메모리 액세스들의 결과를 바라볼 때, _설령_
+     두번째 CPU 가 메모리 배리어를 사용한다 해도, 첫번째 CPU _또한_ 그에 맞는
+     메모리 배리어를 사용하지 않는다면 ("SMP 배리어 짝맞추기" 서브섹션을
+     참고하세요) 그 결과가 올바른 순서로 보여진다는 보장은 없습니다.
+
+ (*) CPU 바깥의 하드웨어[*] 가 메모리 액세스들의 순서를 바꾸지 않는다는 보장은
+     존재하지 않습니다.  CPU 캐시 일관성 메커니즘은 메모리 배리어의 간접적
+     영향을 CPU 사이에 전파하긴 하지만, 순서대로 전파하지는 않을 수 있습니다.
+
+       [*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다:
+
+           Documentation/PCI/pci.txt
+           Documentation/DMA-API-HOWTO.txt
+           Documentation/DMA-API.txt
+
+
+데이터 의존성 배리어
+--------------------
+
+데이터 의존성 배리어의 사용에 있어 지켜야 하는 사항들은 약간 미묘하고, 데이터
+의존성 배리어가 사용되어야 하는 상황도 항상 명백하지는 않습니다.  설명을 위해
+다음의 이벤트 시퀀스를 생각해 봅시다:
+
+       CPU 1                 CPU 2
+       ===============       ===============
+       { A == 1, B == 2, C == 3, P == &A, Q == &C }
+       B = 4;
+       <쓰기 배리어>
+       WRITE_ONCE(P, &B)
+                             Q = READ_ONCE(P);
+                             D = *Q;
+
+여기엔 분명한 데이터 의존성이 존재하므로, 이 시퀀스가 끝났을 때 Q 는 &A 또는 &B
+일 것이고, 따라서:
+
+       (Q == &A) 는 (D == 1) 를,
+       (Q == &B) 는 (D == 4) 를 의미합니다.
+
+하지만!  CPU 2 는 B 의 업데이트를 인식하기 전에 P 의 업데이트를 인식할 수 있고,
+따라서 다음의 결과가 가능합니다:
+
+       (Q == &B) and (D == 2) ????
+
+이런 결과는 일관성이나 인과 관계 유지가 실패한 것처럼 보일 수도 있겠지만,
+그렇지 않습니다, 그리고 이 현상은 (DEC Alpha 와 같은) 여러 CPU 에서 실제로
+발견될 수 있습니다.
+
+이 문제 상황을 제대로 해결하기 위해, 데이터 의존성 배리어나 그보다 강화된
+무언가가 주소를 읽어올 때와 데이터를 읽어올 때 사이에 추가되어야만 합니다:
+
+       CPU 1                 CPU 2
+       ===============       ===============
+       { A == 1, B == 2, C == 3, P == &A, Q == &C }
+       B = 4;
+       <쓰기 배리어>
+       WRITE_ONCE(P, &B);
+                             Q = READ_ONCE(P);
+                             <데이터 의존성 배리어>
+                             D = *Q;
+
+이 변경은 앞의 처음 두가지 결과 중 하나만이 발생할 수 있고, 세번째의 결과는
+발생할 수 없도록 합니다.
+
+데이터 의존성 배리어는 의존적 쓰기에 대해서도 순서를 잡아줍니다:
+
+       CPU 1                 CPU 2
+       ===============       ===============
+       { A == 1, B == 2, C = 3, P == &A, Q == &C }
+       B = 4;
+       <쓰기 배리어>
+       WRITE_ONCE(P, &B);
+                             Q = READ_ONCE(P);
+                             <데이터 의존성 배리어>
+                             *Q = 5;
+
+이 데이터 의존성 배리어는 Q 로의 읽기가 *Q 로의 스토어와 순서를 맞추게
+해줍니다.  이는 다음과 같은 결과를 막습니다:
+
+       (Q == &B) && (B == 4)
+
+이런 패턴은 드물게 사용되어야 함을 알아 두시기 바랍니다.  무엇보다도, 의존성
+순서 규칙의 의도는 쓰기 작업을 -예방- 해서 그로 인해 발생하는 비싼 캐시 미스도
+없애려는 것입니다.  이 패턴은 드물게 발생하는 에러 조건 같은것들을 기록하는데
+사용될 수 있고, 이렇게 배리어를 사용해 순서를 지키게 함으로써 그런 기록이
+사라지는 것을 막습니다.
+
+
+[!] 상당히 비직관적인 이 상황은 분리된 캐시를 가진 기계, 예를 들어 한 캐시
+뱅크가 짝수번 캐시 라인을 처리하고 다른 뱅크는 홀수번 캐시 라인을 처리하는 기계
+등에서 가장 잘 발생합니다.  포인터 P 는 홀수 번호의 캐시 라인에 있고, 변수 B 는
+짝수 번호 캐시 라인에 있다고 생각해 봅시다.  그런 상태에서 읽기 작업을 하는 CPU
+의 짝수번 뱅크는 할 일이 쌓여 매우 바쁘지만 홀수번 뱅크는 할 일이 없어 아무
+일도 하지 않고  있었다면, 포인터 P 는 새 값 (&B) 을, 그리고 변수 B 는 옛날 값
+(2) 을 가지고 있는 상태가 보여질 수도 있습니다.
+
+
+데이터 의존성 배리어는 매우 중요한데, 예를 들어 RCU 시스템에서 그렇습니다.
+include/linux/rcupdate.h 의 rcu_assign_pointer() 와 rcu_dereference() 를
+참고하세요.  여기서 데이터 의존성 배리어는 RCU 로 관리되는 포인터의 타겟을 현재
+타겟에서 수정된 새로운 타겟으로 바꾸는 작업에서 새로 수정된 타겟이 초기화가
+완료되지 않은 채로 보여지는 일이 일어나지 않게 해줍니다.
+
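+A minimal sketch of that publish/subscribe pattern (assumed usage; see
+include/linux/rcupdate.h and the RCU documentation for the real rules;
+do_something_with() is only a placeholder):
+
+       struct foo { int a; };
+       struct foo __rcu *gp;
+       struct foo *p, *q;
+
+       /* Publisher: initialise first, then publish; rcu_assign_pointer()
+          provides the needed write barrier. */
+       p = kmalloc(sizeof(*p), GFP_KERNEL);
+       p->a = 1;
+       rcu_assign_pointer(gp, p);
+
+       /* Reader: rcu_dereference() provides the data dependency barrier,
+          so the dereference below sees the initialised value. */
+       rcu_read_lock();
+       q = rcu_dereference(gp);
+       if (q)
+               do_something_with(q->a);
+       rcu_read_unlock();
+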
+더 많은 예를 위해선 "캐시 일관성" 서브섹션을 참고하세요.
+
+
+컨트롤 의존성
+-------------
+
+로드-로드 컨트롤 의존성은 데이터 의존성 배리어만으로는 정확히 동작할 수가
+없어서 읽기 메모리 배리어를 필요로 합니다.  아래의 코드를 봅시다:
+
+       q = READ_ONCE(a);
+       if (q) {
+               <데이터 의존성 배리어>  /* BUG: No data dependency!!! */
+               p = READ_ONCE(b);
+       }
+
+이 코드는 원하는 대로의 효과를 내지 못할 수 있는데, 이 코드에는 데이터 의존성이
+아니라 컨트롤 의존성이 존재하기 때문으로, 이런 상황에서 CPU 는 실행 속도를 더
+빠르게 하기 위해 분기 조건의 결과를 예측하고 코드를 재배치 할 수 있어서 다른
+CPU 는 b 로부터의 로드 오퍼레이션이 a 로부터의 로드 오퍼레이션보다 먼저 발생한
+걸로 인식할 수 있습니다.  여기에 정말로 필요했던 건 다음과 같습니다:
+
+       q = READ_ONCE(a);
+       if (q) {
+               <읽기 배리어>
+               p = READ_ONCE(b);
+       }
+
+하지만, 스토어 오퍼레이션은 예측적으로 수행되지 않습니다.  즉, 다음 예에서와
+같이 로드-스토어 컨트롤 의존성이 존재하는 경우에는 순서가 -지켜진다-는
+의미입니다.
+
+       q = READ_ONCE(a);
+       if (q) {
+               WRITE_ONCE(b, p);
+       }
+
+컨트롤 의존성은 보통 다른 타입의 배리어들과 짝을 맞춰 사용됩니다.  그렇다곤
+하나, READ_ONCE() 는 반드시 사용해야 함을 부디 명심하세요!  READ_ONCE() 가
+없다면, 컴파일러가 'a' 로부터의 로드를 'a' 로부터의 또다른 로드와, 'b' 로의
+스토어를 'b' 로의 또다른 스토어와 조합해 버려 매우 비직관적인 결과를 초래할 수
+있습니다.
+
+이걸로 끝이 아닌게, 컴파일러가 변수 'a' 의 값이 항상 0이 아니라고 증명할 수
+있다면, 앞의 예에서 "if" 문을 없애서 다음과 같이 최적화 할 수도 있습니다:
+
+       q = a;
+       b = p;  /* BUG: Compiler and CPU can both reorder!!! */
+
+그러니 READ_ONCE() 를 반드시 사용하세요.
+
+다음과 같이 "if" 문의 양갈래 브랜치에 모두 존재하는 동일한 스토어에 대해 순서를
+강제하고 싶은 경우가 있을 수 있습니다:
+
+       q = READ_ONCE(a);
+       if (q) {
+               barrier();
+               WRITE_ONCE(b, p);
+               do_something();
+       } else {
+               barrier();
+               WRITE_ONCE(b, p);
+               do_something_else();
+       }
+
+안타깝게도, 현재의 컴파일러들은 높은 최적화 레벨에서는 이걸 다음과 같이
+바꿔버립니다:
+
+       q = READ_ONCE(a);
+       barrier();
+       WRITE_ONCE(b, p);  /* BUG: No ordering vs. load from a!!! */
+       if (q) {
+               /* WRITE_ONCE(b, p); -- moved up, BUG!!! */
+               do_something();
+       } else {
+               /* WRITE_ONCE(b, p); -- moved up, BUG!!! */
+               do_something_else();
+       }
+
+이제 'a' 에서의 로드와 'b' 로의 스토어 사이에는 조건적 관계가 없기 때문에 CPU
+는 이들의 순서를 바꿀 수 있게 됩니다: 이런 경우에 조건적 관계는 반드시
+필요한데, 모든 컴파일러 최적화가 이루어지고 난 후의 어셈블리 코드에서도
+마찬가지입니다.  따라서, 이 예에서 순서를 지키기 위해서는 smp_store_release()
+와 같은 명시적 메모리 배리어가 필요합니다:
+
+       q = READ_ONCE(a);
+       if (q) {
+               smp_store_release(&b, p);
+               do_something();
+       } else {
+               smp_store_release(&b, p);
+               do_something_else();
+       }
+
+반면에 명시적 메모리 배리어가 없다면, 이런 경우의 순서는 스토어 오퍼레이션들이
+서로 다를 때에만 보장되는데, 예를 들면 다음과 같은 경우입니다:
+
+       q = READ_ONCE(a);
+       if (q) {
+               WRITE_ONCE(b, p);
+               do_something();
+       } else {
+               WRITE_ONCE(b, r);
+               do_something_else();
+       }
+
+처음의 READ_ONCE() 는 컴파일러가 'a' 의 값을 증명해내는 것을 막기 위해 여전히
+필요합니다.
+
+또한, 로컬 변수 'q' 를 가지고 하는 일에 대해 주의해야 하는데, 그러지 않으면
+컴파일러는 그 값을 추측하고 또다시 필요한 조건관계를 없애버릴 수 있습니다.
+예를 들면:
+
+       q = READ_ONCE(a);
+       if (q % MAX) {
+               WRITE_ONCE(b, p);
+               do_something();
+       } else {
+               WRITE_ONCE(b, r);
+               do_something_else();
+       }
+
+만약 MAX 가 1 로 정의된 상수라면, 컴파일러는 (q % MAX) 는 0이란 것을 알아채고,
+위의 코드를 아래와 같이 바꿔버릴 수 있습니다:
+
+       q = READ_ONCE(a);
+       WRITE_ONCE(b, p);
+       do_something_else();
+
+이렇게 되면, CPU 는 변수 'a' 로부터의 로드와 변수 'b' 로의 스토어 사이의 순서를
+지켜줄 필요가 없어집니다.  barrier() 를 추가해 해결해 보고 싶겠지만, 그건
+도움이 안됩니다.  조건 관계는 사라졌고, barrier() 는 이를 되돌리지 못합니다.
+따라서, 이 순서를 지켜야 한다면, MAX 가 1 보다 크다는 것을, 다음과 같은 방법을
+사용해 분명히 해야 합니다:
+
+       q = READ_ONCE(a);
+       BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */
+       if (q % MAX) {
+               WRITE_ONCE(b, p);
+               do_something();
+       } else {
+               WRITE_ONCE(b, r);
+               do_something_else();
+       }
+
+'b' 로의 스토어들은 여전히 서로 다름을 알아두세요.  만약 그것들이 동일하면,
+앞에서 이야기했듯, 컴파일러가 그 스토어 오퍼레이션들을 'if' 문 바깥으로
+끄집어낼 수 있습니다.
+
+또한 이진 조건문 평가에 너무 의존하지 않도록 조심해야 합니다.  다음의 예를
+봅시다:
+
+       q = READ_ONCE(a);
+       if (q || 1 > 0)
+               WRITE_ONCE(b, 1);
+
+첫번째 조건만으로는 브랜치 조건 전체를 거짓으로 만들 수 없고 두번째 조건은 항상
+참이기 때문에, 컴파일러는 이 예를 다음과 같이 바꿔서 컨트롤 의존성을 없애버릴
+수 있습니다:
+
+       q = READ_ONCE(a);
+       WRITE_ONCE(b, 1);
+
+이 예는 컴파일러가 코드를 추측으로 수정할 수 없도록 분명히 해야 한다는 점을
+강조합니다.  조금 더 일반적으로 말해서, READ_ONCE() 는 컴파일러에게 주어진 로드
+오퍼레이션을 위한 코드를 정말로 만들도록 하지만, 컴파일러가 그렇게 만들어진
+코드의 수행 결과를 사용하도록 강제하지는 않습니다.
+
+마지막으로, 컨트롤 의존성은 이행성 (transitivity) 을 제공하지 -않습니다-.  이건
+x 와 y 가 둘 다 0 이라는 초기값을 가졌다는 가정 하의 두개의 예제로
+보이겠습니다:
+
+       CPU 0                     CPU 1
+       =======================   =======================
+       r1 = READ_ONCE(x);        r2 = READ_ONCE(y);
+       if (r1 > 0)               if (r2 > 0)
+         WRITE_ONCE(y, 1);         WRITE_ONCE(x, 1);
+
+       assert(!(r1 == 1 && r2 == 1));
+
+이 두 CPU 예제에서 assert() 의 조건은 항상 참일 것입니다.  그리고, 만약 컨트롤
+의존성이 이행성을 (실제로는 그러지 않지만) 보장한다면, 다음의 CPU 가 추가되어도
+아래의 assert() 조건은 참이 될것입니다:
+
+       CPU 2
+       =====================
+       WRITE_ONCE(x, 2);
+
+       assert(!(r1 == 2 && r2 == 1 && x == 2)); /* FAILS!!! */
+
+하지만 컨트롤 의존성은 이행성을 제공하지 -않기- 때문에, 세개의 CPU 예제가 실행
+완료된 후에 위의 assert() 의 조건은 거짓으로 평가될 수 있습니다.  세개의 CPU
+예제가 순서를 지키길 원한다면, CPU 0 와 CPU 1 코드의 로드와 스토어 사이, "if"
+문 바로 다음에 smp_mb()를 넣어야 합니다.  더 나아가서, 최초의 두 CPU 예제는
+매우 위험하므로 사용되지 않아야 합니다.
+
+이 두개의 예제는 다음 논문:
+http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf 와
+이 사이트: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html 에 나온 LB 와 WWC
+리트머스 테스트입니다.
+
+요약하자면:
+
+  (*) 컨트롤 의존성은 앞의 로드들을 뒤의 스토어들에 대해 순서를 맞춰줍니다.
+      하지만, 그 외의 어떤 순서도 보장하지 -않습니다-: 앞의 로드와 뒤의 로드들
+      사이에도, 앞의 스토어와 뒤의 스토어들 사이에도요.  이런 다른 형태의
+      순서가 필요하다면 smp_rmb() 나 smp_wmb()를, 또는, 앞의 스토어들과 뒤의
+      로드들 사이의 순서를 위해서는 smp_mb() 를 사용하세요.
+
+  (*) "if" 문의 양갈래 브랜치가 같은 변수에의 동일한 스토어로 시작한다면, 그
+      스토어들은 각 스토어 앞에 smp_mb() 를 넣거나 smp_store_release() 를
+      사용해서 스토어를 하는 식으로 순서를 맞춰줘야 합니다.  이 문제를 해결하기
+      위해 "if" 문의 양갈래 브랜치의 시작 지점에 barrier() 를 넣는 것만으로는
+      충분한 해결이 되지 않는데, 이는 앞의 예에서 본것과 같이, 컴파일러의
+      최적화는 barrier() 가 의미하는 바를 지키면서도 컨트롤 의존성을 손상시킬
+      수 있기 때문이라는 점을 부디 알아두시기 바랍니다.
+
+  (*) 컨트롤 의존성은 앞의 로드와 뒤의 스토어 사이에 최소 하나의, 실행
+      시점에서의 조건관계를 필요로 하며, 이 조건관계는 앞의 로드와 관계되어야
+      합니다.  만약 컴파일러가 조건 관계를 최적화로 없앨수 있다면, 순서도
+      최적화로 없애버렸을 겁니다.  READ_ONCE() 와 WRITE_ONCE() 의 주의 깊은
+      사용은 주어진 조건 관계를 유지하는데 도움이 될 수 있습니다.
+
+  (*) 컨트롤 의존성을 위해선 컴파일러가 조건관계를 없애버리는 것을 막아야
+      합니다.  주의 깊은 READ_ONCE() 나 atomic{,64}_read() 의 사용이 컨트롤
+      의존성이 사라지지 않게 하는데 도움을 줄 수 있습니다.  더 많은 정보를
+      위해선 "컴파일러 배리어" 섹션을 참고하시기 바랍니다.
+
+  (*) 컨트롤 의존성은 보통 다른 타입의 배리어들과 짝을 맞춰 사용됩니다.
+
+  (*) 컨트롤 의존성은 이행성을 제공하지 -않습니다-.  이행성이 필요하다면,
+      smp_mb() 를 사용하세요.
+
+
+SMP 배리어 짝맞추기
+--------------------
+
+CPU 간 상호작용을 다룰 때에 일부 타입의 메모리 배리어는 항상 짝을 맞춰
+사용되어야 합니다.  적절하게 짝을 맞추지 않은 코드는 사실상 에러에 가깝습니다.
+
+범용 배리어들은 범용 배리어끼리도 짝을 맞추지만 이행성이 없는 대부분의 다른
+타입의 배리어들과도 짝을 맞춥니다.  ACQUIRE 배리어는 RELEASE 배리어와 짝을
+맞춥니다만, 둘 다 범용 배리어를 포함해 다른 배리어들과도 짝을 맞출 수 있습니다.
+쓰기 배리어는 데이터 의존성 배리어나 컨트롤 의존성, ACQUIRE 배리어, RELEASE
+배리어, 읽기 배리어, 또는 범용 배리어와 짝을 맞춥니다.  비슷하게 읽기 배리어나
+컨트롤 의존성, 또는 데이터 의존성 배리어는 쓰기 배리어나 ACQUIRE 배리어,
+RELEASE 배리어, 또는 범용 배리어와 짝을 맞추는데, 다음과 같습니다:
+
+       CPU 1                 CPU 2
+       ===============       ===============
+       WRITE_ONCE(a, 1);
+       <쓰기 배리어>
+       WRITE_ONCE(b, 2);     x = READ_ONCE(b);
+                             <읽기 배리어>
+                             y = READ_ONCE(a);
+
+또는:
+
+       CPU 1                 CPU 2
+       ===============       ===============================
+       a = 1;
+       <쓰기 배리어>
+       WRITE_ONCE(b, &a);    x = READ_ONCE(b);
+                             <데이터 의존성 배리어>
+                             y = *x;
+
+또는:
+
+       CPU 1                 CPU 2
+       ===============       ===============================
+       r1 = READ_ONCE(y);
+       <범용 배리어>
+       WRITE_ONCE(y, 1);     if (r2 = READ_ONCE(x)) {
+                                <묵시적 컨트롤 의존성>
+                                WRITE_ONCE(y, 1);
+                             }
+
+       assert(r1 == 0 || r2 == 0);
+
+기본적으로, 여기서의 읽기 배리어는 "더 완화된" 타입일 순 있어도 항상 존재해야
+합니다.
+
+[!] 쓰기 배리어 앞의 스토어 오퍼레이션은 일반적으로 읽기 배리어나 데이터
+의존성 배리어 뒤의 로드 오퍼레이션과 매치될 것이고, 반대도 마찬가지입니다:
+
+       CPU 1                               CPU 2
+       ===================                 ===================
+       WRITE_ONCE(a, 1);    }----   --->{  v = READ_ONCE(c);
+       WRITE_ONCE(b, 2);    }    \ /    {  w = READ_ONCE(d);
+       <쓰기 배리어>              \        <읽기 배리어>
+       WRITE_ONCE(c, 3);    }    / \    {  x = READ_ONCE(a);
+       WRITE_ONCE(d, 4);    }----   --->{  y = READ_ONCE(b);
+
+
+메모리 배리어 시퀀스의 예
+-------------------------
+
+첫째, 쓰기 배리어는 스토어 오퍼레이션들의 부분적 순서 세우기로 동작합니다.
+아래의 이벤트 시퀀스를 보세요:
+
+       CPU 1
+       =======================
+       STORE A = 1
+       STORE B = 2
+       STORE C = 3
+       <쓰기 배리어>
+       STORE D = 4
+       STORE E = 5
+
+이 이벤트 시퀀스는 메모리 일관성 시스템에 원소끼리의 순서가 존재하지 않는 집합
+{ STORE A, STORE B, STORE C } 가 역시 원소끼리의 순서가 존재하지 않는 집합
+{ STORE D, STORE E } 보다 먼저 일어난 것으로 시스템의 나머지 요소들에 보이도록
+전달됩니다:
+
+       +-------+       :      :
+       |       |       +------+
+       |       |------>| C=3  |     }     /\
+       |       |  :    +------+     }-----  \  -----> 시스템의 나머지 요소에
+       |       |  :    | A=1  |     }        \/       보여질 수 있는 이벤트들
+       |       |  :    +------+     }
+       | CPU 1 |  :    | B=2  |     }
+       |       |       +------+     }
+       |       |   wwwwwwwwwwwwwwww }   <--- 여기서 쓰기 배리어는 배리어 앞의
+       |       |       +------+     }        모든 스토어가 배리어 뒤의 스토어
+       |       |  :    | E=5  |     }        전에 메모리 시스템에 전달되도록
+       |       |  :    +------+     }        합니다
+       |       |------>| D=4  |     }
+       |       |       +------+
+       +-------+       :      :
+                          |
+                          | CPU 1 에 의해 메모리 시스템에 전달되는
+                          | 일련의 스토어 오퍼레이션들
+                          V
+
+
+둘째, 데이터 의존성 배리어는 데이터 의존적 로드 오퍼레이션들의 부분적 순서
+세우기로 동작합니다.  다음 일련의 이벤트들을 보세요:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { B = 7; X = 9; Y = 8; C = &Y }
+       STORE A = 1
+       STORE B = 2
+       <쓰기 배리어>
+       STORE C = &B            LOAD X
+       STORE D = 4             LOAD C (gets &B)
+                               LOAD *C (reads B)
+
+여기에 별다른 개입이 없다면, CPU 1 의 쓰기 배리어에도 불구하고 CPU 2 는 CPU 1
+의 이벤트들을 완전히 무작위적 순서로 인지하게 됩니다:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+  | CPU 2 에 인지되는
+       |       |------>| B=2  |-----       --->| Y->8  |  | 업데이트 이벤트
+       |       |  :    +------+     \          +-------+  | 시퀀스
+       | CPU 1 |  :    | A=1  |      \     --->| C->&Y |  V
+       |       |       +------+       |        +-------+
+       |       |   wwwwwwwwwwwwwwww   |        :       :
+       |       |       +------+       |        :       :
+       |       |  :    | C=&B |---    |        :       :       +-------+
+       |       |  :    +------+   \   |        +-------+       |       |
+       |       |------>| D=4  |    ----------->| C->&B |------>|       |
+       |       |       +------+       |        +-------+       |       |
+       +-------+       :      :       |        :       :       |       |
+                                      |        :       :       |       |
+                                      |        :       :       | CPU 2 |
+                                      |        +-------+       |       |
+           분명히 잘못된        --->  |        | B->7  |------>|       |
+           B 의 값 인지 (!)           |        +-------+       |       |
+                                      |        :       :       |       |
+                                      |        +-------+       |       |
+           X 의 로드가 B 의    --->    \       | X->9  |------>|       |
+           일관성 유지를                \      +-------+       |       |
+           지연시킴                      ----->| B->2  |       +-------+
+                                               +-------+
+                                               :       :
+
+
+앞의 예에서, CPU 2 는 (B 의 값이 될) *C 의 값 읽기가 C 의 LOAD 뒤에 이어짐에도
+B 가 7 이라는 결과를 얻습니다.
+
+하지만, 만약 데이터 의존성 배리어가 C 의 로드와 *C (즉, B) 의 로드 사이에
+있었다면:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { B = 7; X = 9; Y = 8; C = &Y }
+       STORE A = 1
+       STORE B = 2
+       <쓰기 배리어>
+       STORE C = &B            LOAD X
+       STORE D = 4             LOAD C (gets &B)
+                               <데이터 의존성 배리어>
+                               LOAD *C (reads B)
+
+다음과 같이 됩니다:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| B=2  |-----       --->| Y->8  |
+       |       |  :    +------+     \          +-------+
+       | CPU 1 |  :    | A=1  |      \     --->| C->&Y |
+       |       |       +------+       |        +-------+
+       |       |   wwwwwwwwwwwwwwww   |        :       :
+       |       |       +------+       |        :       :
+       |       |  :    | C=&B |---    |        :       :       +-------+
+       |       |  :    +------+   \   |        +-------+       |       |
+       |       |------>| D=4  |    ----------->| C->&B |------>|       |
+       |       |       +------+       |        +-------+       |       |
+       +-------+       :      :       |        :       :       |       |
+                                      |        :       :       |       |
+                                      |        :       :       | CPU 2 |
+                                      |        +-------+       |       |
+                                      |        | X->9  |------>|       |
+                                      |        +-------+       |       |
+         C 로의 스토어 앞의     --->   \   ddddddddddddddddd   |       |
+         모든 이벤트 결과가             \      +-------+       |       |
+         뒤의 로드에게                   ----->| B->2  |------>|       |
+         보이게 강제한다                       +-------+       |       |
+                                               :       :       +-------+
+
+
+셋째, 읽기 배리어는 로드 오퍼레이션들에의 부분적 순서 세우기로 동작합니다.
+아래의 일련의 이벤트를 봅시다:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { A = 0, B = 9 }
+       STORE A=1
+       <쓰기 배리어>
+       STORE B=2
+                               LOAD B
+                               LOAD A
+
+CPU 1 은 쓰기 배리어를 쳤지만, 별다른 개입이 없다면 CPU 2 는 CPU 1 에서 행해진
+이벤트의 결과를 무작위적 순서로 인지하게 됩니다.
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       | A->0  |------>|       |
+                                       |       +-------+       |       |
+                                       |       :       :       +-------+
+                                        \      :       :
+                                         \     +-------+
+                                          ---->| A->1  |
+                                               +-------+
+                                               :       :
+
+
+하지만, 만약 읽기 배리어가 B 의 로드와 A 의 로드 사이에 존재한다면:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { A = 0, B = 9 }
+       STORE A=1
+       <쓰기 배리어>
+       STORE B=2
+                               LOAD B
+                               <읽기 배리어>
+                               LOAD A
+
+CPU 1 에 의해 만들어진 부분적 순서가 CPU 2 에도 그대로 인지됩니다:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                       |       :       :       |       |
+         여기서 읽기 배리어는   ---->   \  rrrrrrrrrrrrrrrrr   |       |
+         B 로의 스토어 전의              \     +-------+       |       |
+         모든 결과를 CPU 2 에             ---->| A->1  |------>|       |
+         보이도록 한다                         +-------+       |       |
+                                               :       :       +-------+
+
+
+더 완벽한 설명을 위해, A 의 로드가 읽기 배리어 앞과 뒤에 있으면 어떻게 될지
+생각해 봅시다:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { A = 0, B = 9 }
+       STORE A=1
+       <쓰기 배리어>
+       STORE B=2
+                               LOAD B
+                               LOAD A [first load of A]
+                               <읽기 배리어>
+                               LOAD A [second load of A]
+
+A 의 로드 두개가 모두 B 의 로드 뒤에 있지만, 서로 다른 값을 얻어올 수
+있습니다:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                       |       :       :       |       |
+                                       |       +-------+       |       |
+                                       |       | A->0  |------>| 1st   |
+                                       |       +-------+       |       |
+         여기서 읽기 배리어는   ---->   \  rrrrrrrrrrrrrrrrr   |       |
+         B 로의 스토어 전의              \     +-------+       |       |
+         모든 결과를 CPU 2 에             ---->| A->1  |------>| 2nd   |
+         보이도록 한다                         +-------+       |       |
+                                               :       :       +-------+
+
+
+하지만 CPU 1 에서의 A 업데이트는 읽기 배리어가 완료되기 전에도 보일 수도
+있긴 합니다:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                        \      :       :       |       |
+                                         \     +-------+       |       |
+                                          ---->| A->1  |------>| 1st   |
+                                               +-------+       |       |
+                                           rrrrrrrrrrrrrrrrr   |       |
+                                               +-------+       |       |
+                                               | A->1  |------>| 2nd   |
+                                               +-------+       |       |
+                                               :       :       +-------+
+
+
+여기서 보장되는 건, 만약 B 의 로드가 B == 2 라는 결과를 봤다면, A 에의 두번째
+로드는 항상 A == 1 을 보게 될 것이라는 겁니다.  A 에의 첫번째 로드에는 그런
+보장이 없습니다; A == 0 이거나 A == 1 이거나 둘 중 하나의 결과를 보게 될겁니다.
+
+
+읽기 메모리 배리어 VS 로드 예측
+-------------------------------
+
+많은 CPU들이 로드를 예측적으로 (speculatively) 합니다: 어떤 데이터를 메모리에서
+로드해야 하게 될지 예측을 했다면, 해당 데이터를 로드하는 인스트럭션을 실제로는
+아직 만나지 않았더라도 다른 로드 작업이 없어 버스 (bus) 가 아무 일도 하고 있지
+않다면, 그 데이터를 로드합니다.  이후에 실제 로드 인스트럭션이 실행되면 CPU 가
+이미 그 값을 가지고 있기 때문에 그 로드 인스트럭션은 즉시 완료됩니다.
+
+해당 CPU 는 실제로는 그 값이 필요치 않았다는 사실이 나중에 드러날 수도 있는데 -
+해당 로드 인스트럭션이 브랜치로 우회되거나 했을 수 있겠죠 - , 그렇게 되면 앞서
+읽어둔 값을 버리거나 나중의 사용을 위해 캐시에 넣어둘 수 있습니다.
+
+다음을 생각해 봅시다:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+                               LOAD B
+                               DIVIDE          } 나누기 명령은 일반적으로
+                               DIVIDE          } 긴 시간을 필요로 합니다
+                               LOAD A
+
+는 이렇게 될 수 있습니다:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       나누기 하느라 바쁜       --->       --->| A->0  |~~~~   |       |
+       CPU 는 A 의 LOAD 를                     +-------+   ~   |       |
+       예측해서 수행한다                       :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+       나누기가 끝나면       --->     --->     :       :   ~-->|       |
+       CPU 는 해당 LOAD 를                     :       :       |       |
+       즉각 완료한다                           :       :       +-------+
+
+
+읽기 배리어나 데이터 의존성 배리어를 두번째 로드 직전에 놓는다면:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+                               LOAD B
+                               DIVIDE
+                               DIVIDE
+                               <읽기 배리어>
+                               LOAD A
+
+예측으로 얻어진 값은 사용된 배리어의 타입에 따라서 해당 값이 옳은지 검토되게
+됩니다.  만약 해당 메모리 영역에 변화가 없었다면, 예측으로 얻어두었던 값이
+사용됩니다:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       나누기 하느라 바쁜       --->       --->| A->0  |~~~~   |       |
+       CPU 는 A 의 LOAD 를                     +-------+   ~   |       |
+       예측한다                                :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+                                               :       :   ~   |       |
+                                           rrrrrrrrrrrrrrrr~   |       |
+                                               :       :   ~   |       |
+                                               :       :   ~-->|       |
+                                               :       :       |       |
+                                               :       :       +-------+
+
+
+하지만 다른 CPU 에서 업데이트나 무효화가 있었다면, 그 예측은 무효화되고 그 값은
+다시 읽혀집니다:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       나누기 하느라 바쁜       --->       --->| A->0  |~~~~   |       |
+       CPU 는 A 의 LOAD 를                     +-------+   ~   |       |
+       예측한다                                :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+                                               :       :   ~   |       |
+                                           rrrrrrrrrrrrrrrrr   |       |
+                                               +-------+       |       |
+       예측성 동작은 무효화 되고    --->   --->| A->1  |------>|       |
+       업데이트된 값이 다시 읽혀진다           +-------+       |       |
+                                               :       :       +-------+
+
+
+이행성
+------
+
+이행성(transitivity)은 실제의 컴퓨터 시스템에서 항상 제공되지는 않는, 순서
+맞추기에 대한 상당히 직관적인 개념입니다.  다음의 예가 이행성을 보여줍니다:
+
+       CPU 1                   CPU 2                   CPU 3
+       ======================= ======================= =======================
+               { X = 0, Y = 0 }
+       STORE X=1               LOAD X                  STORE Y=1
+                               <범용 배리어>              <범용 배리어>
+                               LOAD Y                  LOAD X
+
+CPU 2 의 X 로드가 1을 리턴했고 Y 로드가 0을 리턴했다고 해봅시다.  이는 CPU 2 의
+X 로드가 CPU 1 의 X 스토어 뒤에 이루어졌고 CPU 2 의 Y 로드는 CPU 3 의 Y 스토어
+전에 이루어졌음을 의미합니다.  그럼 "CPU 3 의 X 로드는 0을 리턴할 수 있나요?"
+
+CPU 2 의 X 로드는 CPU 1 의 스토어 후에 이루어졌으니, CPU 3 의 X 로드는 1을
+리턴하는게 자연스럽습니다.  이런 생각이 이행성의 한 예입니다: CPU A 에서 실행된
+로드가 CPU B 에서의 같은 변수에 대한 로드를 뒤따른다면, CPU A 의 로드는 CPU B
+의 로드가 내놓은 값과 같거나 그 후의 값을 내놓아야 합니다.
+
+리눅스 커널에서 범용 배리어의 사용은 이행성을 보장합니다.  따라서, 앞의 예에서
+CPU 2 의 X 로드가 1을, Y 로드는 0을 리턴했다면, CPU 3 의 X 로드는 반드시 1을
+리턴합니다.
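+
+참고로, 위의 예를 리눅스 커널 스타일의 C 코드로 옮겨보면 대략 다음과 같은
+모습일 겁니다 (r1, r2, r3 은 각 CPU 의 로드 결과를 담기 위해 가정한
+변수들입니다):
+
+       int x, y;
+
+       void cpu1(void)
+       {
+               WRITE_ONCE(x, 1);
+       }
+
+       void cpu2(void)
+       {
+               r1 = READ_ONCE(x);      /* 1 을 읽었다고 가정 */
+               smp_mb();               /* <범용 배리어> */
+               r2 = READ_ONCE(y);      /* 0 을 읽었다고 가정 */
+       }
+
+       void cpu3(void)
+       {
+               WRITE_ONCE(y, 1);
+               smp_mb();               /* <범용 배리어> */
+               r3 = READ_ONCE(x);      /* r1 == 1 && r2 == 0 이라면 반드시 1 */
+       }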
+
+하지만, 읽기나 쓰기 배리어에 대해서는 이행성이 보장되지 -않습니다-.  예를 들어,
+앞의 예에서 CPU 2 의 범용 배리어가 아래처럼 읽기 배리어로 바뀐 경우를 생각해
+봅시다:
+
+       CPU 1                   CPU 2                   CPU 3
+       ======================= ======================= =======================
+               { X = 0, Y = 0 }
+       STORE X=1               LOAD X                  STORE Y=1
+                               <읽기 배리어>              <범용 배리어>
+                               LOAD Y                  LOAD X
+
+이 코드는 이행성을 갖지 않습니다: 이 예에서는, CPU 2 의 X 로드가 1을
+리턴하고, Y 로드는 0을 리턴하지만 CPU 3 의 X 로드가 0을 리턴하는 것도 완전히
+합법적입니다.
+
+CPU 2 의 읽기 배리어가 자신의 읽기는 순서를 맞춰줘도, CPU 1 의 스토어와의
+순서를 맞춰준다고는 보장할 수 없다는게 핵심입니다.  따라서, CPU 1 과 CPU 2 가
+버퍼나 캐시를 공유하는 시스템에서 이 예제 코드가 실행된다면, CPU 2 는 CPU 1 이
+쓴 값에 좀 빨리 접근할 수 있을 것입니다.  따라서 CPU 1 과 CPU 2 의 접근으로
+조합된 순서를 모든 CPU 가 동의할 수 있도록 하기 위해 범용 배리어가 필요합니다.
+
+범용 배리어는 "글로벌 이행성"을 제공해서, 모든 CPU 들이 오퍼레이션들의 순서에
+동의하게 할 것입니다.  반면, release-acquire 조합은 "로컬 이행성" 만을
+제공해서, 해당 조합이 사용된 CPU 들만이 해당 액세스들의 조합된 순서에 동의함이
+보장됩니다.  예를 들어, 존경스런 Herman Hollerith 의 C 코드로 보면:
+
+       int u, v, x, y, z;
+
+       void cpu0(void)
+       {
+               r0 = smp_load_acquire(&x);
+               WRITE_ONCE(u, 1);
+               smp_store_release(&y, 1);
+       }
+
+       void cpu1(void)
+       {
+               r1 = smp_load_acquire(&y);
+               r4 = READ_ONCE(v);
+               r5 = READ_ONCE(u);
+               smp_store_release(&z, 1);
+       }
+
+       void cpu2(void)
+       {
+               r2 = smp_load_acquire(&z);
+               smp_store_release(&x, 1);
+       }
+
+       void cpu3(void)
+       {
+               WRITE_ONCE(v, 1);
+               smp_mb();
+               r3 = READ_ONCE(u);
+       }
+
+cpu0(), cpu1(), 그리고 cpu2() 는 smp_store_release()/smp_load_acquire() 쌍의
+연결을 통한 로컬 이행성에 동참하고 있으므로, 다음과 같은 결과는 나오지 않을
+겁니다:
+
+       r0 == 1 && r1 == 1 && r2 == 1
+
+더 나아가서, cpu0() 와 cpu1() 사이의 release-acquire 관계로 인해, cpu1() 은
+cpu0() 의 쓰기를 봐야만 하므로, 다음과 같은 결과도 없을 겁니다:
+
+       r1 == 1 && r5 == 0
+
+하지만, release-acquire 이행성은 동참한 CPU 들에만 적용되므로 cpu3() 에는
+적용되지 않습니다.  따라서, 다음과 같은 결과가 가능합니다:
+
+       r0 == 0 && r1 == 1 && r2 == 1 && r3 == 0 && r4 == 0
+
+비슷하게, 다음과 같은 결과도 가능합니다:
+
+       r0 == 0 && r1 == 1 && r2 == 1 && r3 == 0 && r4 == 0 && r5 == 1
+
+cpu0(), cpu1(), 그리고 cpu2() 는 그들의 읽기와 쓰기를 순서대로 보게 되지만,
+release-acquire 체인에 관여되지 않은 CPU 들은 그 순서에 이견을 가질 수
+있습니다.  이런 이견은 smp_load_acquire() 와 smp_store_release() 의 구현에
+사용되는 완화된 메모리 배리어 인스트럭션들은 항상 배리어 앞의 스토어들을 뒤의
+로드들에 앞세울 필요는 없다는 사실에서 기인합니다.  이 말은 cpu3() 는 cpu0() 의
+u 로의 스토어를 cpu1() 의 v 로부터의 로드 뒤에 일어난 것으로 볼 수 있다는
+뜻입니다, cpu0() 와 cpu1() 은 이 두 오퍼레이션이 의도된 순서대로 일어났음에
+모두 동의하는데도 말입니다.
+
+하지만, smp_load_acquire() 는 마술이 아님을 명심하시기 바랍니다.  구체적으로,
+이 함수는 단순히 순서 규칙을 지키며 인자로부터의 읽기를 수행합니다.  이것은
+어떤 특정한 값이 읽힐 것인지는 보장하지 -않습니다-.  따라서, 다음과 같은 결과도
+가능합니다:
+
+       r0 == 0 && r1 == 0 && r2 == 0 && r5 == 0
+
+이런 결과는 어떤 것도 재배치 되지 않는, 순차적 일관성을 가진 가상의
+시스템에서도 일어날 수 있음을 기억해 두시기 바랍니다.
+
+다시 말하지만, 당신의 코드가 글로벌 이행성을 필요로 한다면, 범용 배리어를
+사용하십시오.
+
+
+==================
+명시적 커널 배리어
+==================
+
+리눅스 커널은 서로 다른 단계에서 동작하는 다양한 배리어들을 가지고 있습니다:
+
+  (*) 컴파일러 배리어.
+
+  (*) CPU 메모리 배리어.
+
+  (*) MMIO 쓰기 배리어.
+
+
+컴파일러 배리어
+---------------
+
+리눅스 커널은 컴파일러가 메모리 액세스를 재배치 하는 것을 막아주는 명시적인
+컴파일러 배리어를 가지고 있습니다:
+
+       barrier();
+
+이건 범용 배리어입니다 -- barrier() 의 읽기-읽기 나 쓰기-쓰기 변종은 없습니다.
+하지만, READ_ONCE() 와 WRITE_ONCE() 는 특정 액세스들에 대해서만 동작하는
+barrier() 의 완화된 형태로 볼 수 있습니다.
+
+barrier() 함수는 다음과 같은 효과를 갖습니다:
+
+ (*) 컴파일러가 barrier() 뒤의 액세스들이 barrier() 앞의 액세스보다 앞으로
+     재배치되지 못하게 합니다.  예를 들어, 인터럽트 핸들러 코드와 인터럽트 당한
+     코드 사이의 통신을 용이하게 하기 위해 사용될 수 있습니다.
+
+ (*) 루프에서, 컴파일러가 루프 조건에 사용된 변수를 매 이터레이션마다
+     메모리에서 로드하지 않아도 되도록 최적화 하는걸 방지합니다.
+
+READ_ONCE() 와 WRITE_ONCE() 함수는 싱글 쓰레드 코드에서는 문제 없지만 동시성이
+있는 코드에서는 문제가 될 수 있는 모든 최적화를 막습니다.  이런 류의 최적화에
+대한 예를 몇가지 들어보면 다음과 같습니다:
+
+ (*) 컴파일러는 같은 변수에 대한 로드와 스토어를 재배치 할 수 있고, 어떤
+     경우에는 CPU가 같은 변수로부터의 로드들을 재배치할 수도 있습니다.  이는
+     다음의 코드가:
+
+       a[0] = x;
+       a[1] = x;
+
+     x 의 예전 값이 a[1] 에, 새 값이 a[0] 에 있게 할 수 있다는 뜻입니다.
+     컴파일러와 CPU가 이런 일을 못하게 하려면 다음과 같이 해야 합니다:
+
+       a[0] = READ_ONCE(x);
+       a[1] = READ_ONCE(x);
+
+     즉, READ_ONCE() 와 WRITE_ONCE() 는 여러 CPU 에서 하나의 변수에 가해지는
+     액세스들에 캐시 일관성을 제공합니다.
+
+ (*) 컴파일러는 같은 변수에 대한 연속적인 로드들을 병합할 수 있습니다.  그런
+     병합 작업으로 컴파일러는 다음의 코드를:
+
+       while (tmp = a)
+               do_something_with(tmp);
+
+     다음과 같이, 싱글 쓰레드 코드에서는 말이 되지만 개발자의 의도와 전혀 맞지
+     않는 방향으로 "최적화" 할 수 있습니다:
+
+       if (tmp = a)
+               for (;;)
+                       do_something_with(tmp);
+
+     컴파일러가 이런 짓을 하지 못하게 하려면 READ_ONCE() 를 사용하세요:
+
+       while (tmp = READ_ONCE(a))
+               do_something_with(tmp);
+
+ (*) 예컨대 레지스터 사용량이 많아 컴파일러가 모든 데이터를 레지스터에 담을 수
+     없는 경우, 컴파일러는 변수를 다시 로드할 수 있습니다.  따라서 컴파일러는
+     앞의 예에서 변수 'tmp' 사용을 최적화로 없애버릴 수 있습니다:
+
+       while (tmp = a)
+               do_something_with(tmp);
+
+     이 코드는 다음과 같이 싱글 쓰레드에서는 완벽하지만 동시성이 존재하는
+     경우엔 치명적인 코드로 바뀔 수 있습니다:
+
+       while (a)
+               do_something_with(a);
+
+     예를 들어, 최적화된 이 코드는 변수 a 가 다른 CPU 에 의해 "while" 문과
+     do_something_with() 호출 사이에 바뀌어 do_something_with() 에 0을 넘길
+     수도 있습니다.
+
+     이번에도, 컴파일러가 그런 짓을 하는걸 막기 위해 READ_ONCE() 를 사용하세요:
+
+       while (tmp = READ_ONCE(a))
+               do_something_with(tmp);
+
+     레지스터가 부족한 상황을 겪는 경우, 컴파일러는 tmp 를 스택에 저장해둘 수도
+     있습니다.  컴파일러가 변수를 다시 읽어들이는건 이렇게 저장해두고 후에 다시
+     읽어들이는데 드는 오버헤드 때문입니다.  그렇게 하는게 싱글 쓰레드
+     코드에서는 안전하므로, 안전하지 않은 경우에는 컴파일러에게 직접 알려줘야
+     합니다.
+
+ (*) 컴파일러는 그 값이 무엇일지 알고 있다면 로드를 아예 안할 수도 있습니다.
+     예를 들어, 다음의 코드는 변수 'a' 의 값이 항상 0임을 증명할 수 있다면:
+
+       while (tmp = a)
+               do_something_with(tmp);
+
+     이렇게 최적화 되어버릴 수 있습니다:
+
+       do { } while (0);
+
+     이 변환은 싱글 쓰레드 코드에서는 도움이 되는데 로드와 브랜치를 제거했기
+     때문입니다.  문제는 컴파일러가 'a' 의 값을 업데이트 하는건 현재의 CPU 하나
+     뿐이라는 가정 위에서 증명을 했다는데 있습니다.  만약 변수 'a' 가 공유되어
+     있다면, 컴파일러의 증명은 틀린 것이 될겁니다.  컴파일러는 그 자신이
+     생각하는 것만큼 많은 것을 알고 있지 못함을 컴파일러에게 알리기 위해
+     READ_ONCE() 를 사용하세요:
+
+       while (tmp = READ_ONCE(a))
+               do_something_with(tmp);
+
+     하지만 컴파일러는 READ_ONCE() 뒤에 나오는 값에 대해서도 눈길을 두고 있음을
+     기억하세요.  예를 들어, 다음의 코드에서 MAX 는 전처리기 매크로로, 1의 값을
+     갖는다고 해봅시다:
+
+       while ((tmp = READ_ONCE(a)) % MAX)
+               do_something_with(tmp);
+
+     이렇게 되면 컴파일러는 MAX 를 가지고 수행되는 "%" 오퍼레이터의 결과가 항상
+     0이라는 것을 알게 되고, 컴파일러가 코드를 실질적으로는 존재하지 않는
+     것처럼 최적화 하는 것이 허용되어 버립니다.  ('a' 변수의 로드는 여전히
+     행해질 겁니다.)
+
+ (*) 비슷하게, 컴파일러는 변수가 저장하려 하는 값을 이미 가지고 있다는 것을
+     알면 스토어 자체를 제거할 수 있습니다.  이번에도, 컴파일러는 현재의 CPU
+     만이 그 변수에 값을 쓰는 오로지 하나의 존재라고 생각하여 공유된 변수에
+     대해서는 잘못된 일을 하게 됩니다.  예를 들어, 다음과 같은 경우가 있을 수
+     있습니다:
+
+       a = 0;
+       ... 변수 a 에 스토어를 하지 않는 코드 ...
+       a = 0;
+
+     컴파일러는 변수 'a' 의 값은 이미 0이라는 것을 알고, 따라서 두번째 스토어를
+     삭제할 겁니다.  만약 다른 CPU 가 그 사이 변수 'a' 에 다른 값을 썼다면
+     황당한 결과가 나올 겁니다.
+
+     컴파일러가 그런 잘못된 추측을 하지 않도록 WRITE_ONCE() 를 사용하세요:
+
+       WRITE_ONCE(a, 0);
+       ... 변수 a 에 스토어를 하지 않는 코드 ...
+       WRITE_ONCE(a, 0);
+
+ (*) 컴파일러는 하지 말라고 하지 않으면 메모리 액세스들을 재배치 할 수
+     있습니다.  예를 들어, 다음의 프로세스 레벨 코드와 인터럽트 핸들러 사이의
+     상호작용을 생각해 봅시다:
+
+       void process_level(void)
+       {
+               msg = get_message();
+               flag = true;
+       }
+
+       void interrupt_handler(void)
+       {
+               if (flag)
+                       process_message(msg);
+       }
+
+     이 코드에는 컴파일러가 process_level() 을 다음과 같이 변환하는 것을 막을
+     수단이 없고, 이런 변환은 싱글쓰레드에서라면 실제로 훌륭한 선택일 수
+     있습니다:
+
+       void process_level(void)
+       {
+               flag = true;
+               msg = get_message();
+       }
+
+     이 두개의 문장 사이에 인터럽트가 발생한다면, interrupt_handler() 는 의미를
+     알 수 없는 메세지를 받을 수도 있습니다.  이걸 막기 위해 다음과 같이
+     WRITE_ONCE() 를 사용하세요:
+
+       void process_level(void)
+       {
+               WRITE_ONCE(msg, get_message());
+               WRITE_ONCE(flag, true);
+       }
+
+       void interrupt_handler(void)
+       {
+               if (READ_ONCE(flag))
+                       process_message(READ_ONCE(msg));
+       }
+
+     interrupt_handler() 안에서도 중첩된 인터럽트나 NMI 와 같이 인터럽트 핸들러
+     역시 'flag' 와 'msg' 에 접근하는 또다른 무언가에 인터럽트 될 수 있다면
+     READ_ONCE() 와 WRITE_ONCE() 를 사용해야 함을 기억해 두세요.  만약 그런
+     가능성이 없다면, interrupt_handler() 안에서는 문서화 목적이 아니라면
+     READ_ONCE() 와 WRITE_ONCE() 는 필요치 않습니다.  (근래의 리눅스 커널에서
+     중첩된 인터럽트는 보통 잘 일어나지 않음도 기억해 두세요, 실제로, 어떤
+     인터럽트 핸들러가 인터럽트가 활성화된 채로 리턴하면 WARN_ONCE() 가
+     실행됩니다.)
+
+     컴파일러는 READ_ONCE() 와 WRITE_ONCE() 뒤의 READ_ONCE() 나 WRITE_ONCE(),
+     barrier(), 또는 비슷한 것들을 담고 있지 않은 코드를 움직일 수 있을 것으로
+     가정되어야 합니다.
+
+     이 효과는 barrier() 를 통해서도 만들 수 있지만, READ_ONCE() 와
+     WRITE_ONCE() 가 좀 더 안목 높은 선택입니다: READ_ONCE() 와 WRITE_ONCE()는
+     컴파일러에 주어진 메모리 영역에 대해서만 최적화 가능성을 포기하도록
+     하지만, barrier() 는 컴파일러가 지금까지 기계의 레지스터에 캐시해 놓은
+     모든 메모리 영역의 값을 버려야 하게 하기 때문입니다.  물론, 컴파일러는
+     READ_ONCE() 와 WRITE_ONCE() 가 일어난 순서도 지켜줍니다, CPU 는 당연히
+     그 순서를 지킬 의무가 없지만요.
+
+ (*) 컴파일러는 다음의 예에서와 같이 변수에의 스토어를 날조해낼 수도 있습니다:
+
+       if (a)
+               b = a;
+       else
+               b = 42;
+
+     컴파일러는 아래와 같은 최적화로 브랜치를 줄일 겁니다:
+
+       b = 42;
+       if (a)
+               b = a;
+
+     싱글 쓰레드 코드에서 이 최적화는 안전할 뿐 아니라 브랜치 갯수를
+     줄여줍니다.  하지만 안타깝게도, 동시성이 있는 코드에서는 이 최적화는 다른
+     CPU 가 'b' 를 로드할 때, -- 'a' 가 0이 아닌데도 -- 가짜인 값, 42를 보게
+     되는 경우를 가능하게 합니다.  이걸 방지하기 위해 WRITE_ONCE() 를
+     사용하세요:
+
+       if (a)
+               WRITE_ONCE(b, a);
+       else
+               WRITE_ONCE(b, 42);
+
+     컴파일러는 로드를 만들어낼 수도 있습니다.  일반적으로는 문제를 일으키지
+     않지만, 캐시 라인 바운싱을 일으켜 성능과 확장성을 떨어뜨릴 수 있습니다.
+     날조된 로드를 막기 위해선 READ_ONCE() 를 사용하세요.
+
+ (*) 정렬된 메모리 주소에 위치한, 한번의 메모리 참조 인스트럭션으로 액세스
+     가능한 크기의 데이터에 대해선, READ_ONCE() 와 WRITE_ONCE() 가 하나의 큰
+     액세스가 여러개의 작은 액세스들로 대체되는 "로드 티어링(load tearing)" 과
+     "스토어 티어링(store tearing)" 을 방지합니다.  예를 들어, 주어진 아키텍쳐가
+     7-bit immediate field 를 갖는
+     16-bit 스토어 인스트럭션을 제공한다면, 컴파일러는 다음의 32-bit 스토어를
+     구현하는데에 두개의 16-bit store-immediate 명령을 사용하려 할겁니다:
+
+       p = 0x00010002;
+
+     스토어 할 상수를 만들고 그 값을 스토어 하기 위해 두개가 넘는 인스트럭션을
+     사용하게 되는, 이런 종류의 최적화를 GCC 는 실제로 함을 부디 알아 두십시오.
+     이 최적화는 싱글 쓰레드 코드에서는 성공적인 최적화 입니다.  실제로, 근래에
+     발생한 (그리고 고쳐진) 버그는 GCC 가 volatile 스토어에 비정상적으로 이
+     최적화를 사용하게 했습니다.  그런 버그가 없다면, 다음의 예에서
+     WRITE_ONCE() 의 사용은 스토어 티어링을 방지합니다:
+
+       WRITE_ONCE(p, 0x00010002);
+
+     Packed 구조체의 사용 역시 다음의 예처럼  로드 / 스토어 티어링을 유발할 수
+     있습니다:
+
+       struct __attribute__((__packed__)) foo {
+               short a;
+               int b;
+               short c;
+       };
+       struct foo foo1, foo2;
+       ...
+
+       foo2.a = foo1.a;
+       foo2.b = foo1.b;
+       foo2.c = foo1.c;
+
+     READ_ONCE() 나 WRITE_ONCE() 도 없고 volatile 마킹도 없기 때문에,
+     컴파일러는 이 세개의 대입문을 두개의 32-bit 로드와 두개의 32-bit 스토어로
+     변환할 수 있습니다.  이는 'foo1.b' 의 값의 로드 티어링과 'foo2.b' 의
+     스토어 티어링을 초래할 겁니다.  이 예에서도 READ_ONCE() 와 WRITE_ONCE()
+     가 티어링을 막을 수 있습니다:
+
+       foo2.a = foo1.a;
+       WRITE_ONCE(foo2.b, READ_ONCE(foo1.b));
+       foo2.c = foo1.c;
+
+그렇지만, volatile 로 마크된 변수에 대해서는 READ_ONCE() 와 WRITE_ONCE() 가
+필요치 않습니다.  예를 들어, 'jiffies' 는 volatile 로 마크되어 있기 때문에,
+READ_ONCE(jiffies) 라고 할 필요가 없습니다.  READ_ONCE() 와 WRITE_ONCE() 가
+실은 volatile 캐스팅으로 구현되어 있어서 인자가 이미 volatile 로 마크되어
+있다면 또다른 효과를 내지는 않기 때문입니다.
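+
+개념만 놓고 보면, 이 volatile 캐스팅은 대략 다음과 같은 형태로 생각해볼 수
+있습니다 (실제 커널 매크로가 아니라 설명을 위해 단순화한 가정적 스케치이며,
+실제 구현은 액세스 크기별 처리 등 더 많은 일을 합니다):
+
+       #define MY_READ_ONCE(x)         (*(volatile typeof(x) *)&(x))
+       #define MY_WRITE_ONCE(x, val)   (*(volatile typeof(x) *)&(x) = (val))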
+
+이 컴파일러 배리어들은 CPU 에는 직접적 효과를 전혀 만들지 않기 때문에, 결국은
+재배치가 일어날 수도 있음을 부디 기억해 두십시오.
+
+
+CPU 메모리 배리어
+-----------------
+
+리눅스 커널은 다음의 여덟개 기본 CPU 메모리 배리어를 가지고 있습니다:
+
+       TYPE            MANDATORY               SMP CONDITIONAL
+       =============== ======================= ===========================
+       범용          mb()                    smp_mb()
+       쓰기          wmb()                   smp_wmb()
+       읽기          rmb()                   smp_rmb()
+       데이터 의존성     read_barrier_depends()  smp_read_barrier_depends()
+
+
+데이터 의존성 배리어를 제외한 모든 메모리 배리어는 컴파일러 배리어를
+포함합니다.  데이터 의존성은 컴파일러에의 추가적인 순서 보장을 포함하지
+않습니다.
+
+방백: 데이터 의존성이 있는 경우, 컴파일러는 해당 로드를 올바른 순서로 일으킬
+것으로 (예: `a[b]` 는 a[b] 를 로드 하기 전에 b 의 값을 먼저 로드한다)
+기대되지만, C 언어 사양에는 컴파일러가 b 의 값을 추측 (예: 1 과 같음) 해서
+b  로드 전에 a 로드를 하는 코드 (예: tmp = a[1]; if (b != 1) tmp = a[b]; ) 를
+만들지 않아야 한다는 내용 같은 건 없습니다.  또한 컴파일러는 a[b] 를 로드한
+후에 b 를 또다시 로드할 수도 있어서, a[b] 보다 최신 버전의 b 값을 가질 수도
+있습니다.  이런 문제들의 해결책에 대한 의견 일치는 아직 없습니다만, 일단
+READ_ONCE() 매크로부터 보기 시작하는게 좋은 시작이 될겁니다.
+
+SMP 메모리 배리어들은 유니프로세서로 컴파일된 시스템에서는 컴파일러 배리어로
+바뀌는데, 하나의 CPU 는 스스로 일관성을 유지하고, 겹치는 액세스들 역시 올바른
+순서로 행해질 것으로 생각되기 때문입니다.  하지만, 아래의 "Virtual Machine
+Guests" 서브섹션을 참고하십시오.
+
+[!] SMP 시스템에서 공유메모리로의 접근들을 순서 세워야 할 때, SMP 메모리
+배리어는 _반드시_ 사용되어야 함을 기억하세요, 그대신 락을 사용하는 것으로도
+충분하긴 하지만 말이죠.
+
+Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 불필요한 오버헤드를
+만들기 때문에, SMP 효과만 통제하면 되는 곳에는 사용되지 않아야
+합니다.  하지만, 느슨한 순서 규칙의 메모리 I/O 윈도우를 통한 MMIO 의 효과를
+통제할 때에는 mandatory 배리어들이 사용될 수 있습니다.  이 배리어들은
+컴파일러와 CPU 모두 재배치를 못하도록 함으로써 메모리 오퍼레이션들이 디바이스에
+보여지는 순서에도 영향을 주기 때문에, SMP 가 아닌 시스템이라 할지라도 필요할 수
+있습니다.
+
+
+일부 고급 배리어 함수들도 있습니다:
+
+ (*) smp_store_mb(var, value)
+
+     이 함수는 특정 변수에 특정 값을 대입하고 범용 메모리 배리어를 칩니다.
+     UP 컴파일에서는 컴파일러 배리어보다 더한 것을 친다고는 보장되지 않습니다.
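+
+     개념적으로 이 함수는 대략 다음 조합과 같은 의미를 갖습니다 (아키텍쳐에
+     따라 실제 구현은 다를 수 있습니다):
+
+       WRITE_ONCE(var, value);
+       smp_mb();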
+
+
+ (*) smp_mb__before_atomic();
+ (*) smp_mb__after_atomic();
+
+     이것들은 값을 리턴하지 않는 (더하기, 빼기, 증가, 감소와 같은) 어토믹
+     함수들을 위한, 특히 그것들이 레퍼런스 카운팅에 사용될 때를 위한
+     함수들입니다.  이 함수들은 메모리 배리어를 내포하고 있지는 않습니다.
+
+     이것들은 값을 리턴하지 않으며 어토믹한 (set_bit 과 clear_bit 같은) 비트
+     연산에도 사용될 수 있습니다.
+
+     한 예로, 객체 하나를 무효한 것으로 표시하고 그 객체의 레퍼런스 카운트를
+     감소시키는 다음 코드를 보세요:
+
+       obj->dead = 1;
+       smp_mb__before_atomic();
+       atomic_dec(&obj->ref_count);
+
+     이 코드는 객체의 업데이트된 death 마크가 레퍼런스 카운터 감소 동작
+     *전에* 보일 것을 보장합니다.
+
+     더 많은 정보를 위해선 Documentation/atomic_ops.txt 문서를 참고하세요.
+     어디서 이것들을 사용해야 할지 궁금하다면 "어토믹 오퍼레이션" 서브섹션을
+     참고하세요.
+
+
+ (*) lockless_dereference();
+
+     이 함수는 smp_read_barrier_depends() 데이터 의존성 배리어를 사용하는
+     포인터 읽어오기 래퍼(wrapper) 함수로 생각될 수 있습니다.
+
+     객체의 라이프타임이 RCU 외의 메커니즘으로 관리된다는 점을 제외하면
+     rcu_dereference() 와도 유사한데, 예를 들면 객체가 시스템이 꺼질 때에만
+     제거되는 경우 등입니다.  또한, lockless_dereference() 은 RCU 와 함께
+     사용될수도, RCU 없이 사용될 수도 있는 일부 데이터 구조에 사용되고
+     있습니다.
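+
+     한 예로, 시스템이 꺼질 때에만 해제되는 전역 객체를 가리키는 포인터를
+     읽어와 사용하는 코드는 다음과 같은 모습일 수 있습니다 (gp, p, field 는
+     설명을 위해 가정한 이름들입니다):
+
+       p = lockless_dereference(gp);
+       if (p)
+               do_something_with(p->field);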
+
+
+ (*) dma_wmb();
+ (*) dma_rmb();
+
+     이것들은 CPU 와 DMA 가능한 디바이스에서 모두 액세스 가능한 공유 메모리의
+     읽기, 쓰기 작업들의 순서를 보장하기 위해 consistent memory 에서 사용하기
+     위한 것들입니다.
+
+     예를 들어, 디바이스와 메모리를 공유하며, 디스크립터 상태 값을 사용해
+     디스크립터가 디바이스에 속해 있는지 아니면 CPU 에 속해 있는지 표시하고,
+     공지용 초인종(doorbell) 을 사용해 업데이트된 디스크립터가 디바이스에 사용
+     가능해졌음을 공지하는 디바이스 드라이버를 생각해 봅시다:
+
+       if (desc->status != DEVICE_OWN) {
+               /* 디스크립터를 소유하기 전에는 데이터를 읽지 않음 */
+               dma_rmb();
+
+               /* 데이터를 읽고 씀 */
+               read_data = desc->data;
+               desc->data = write_data;
+
+               /* 상태 업데이트 전 수정사항을 반영 */
+               dma_wmb();
+
+               /* 소유권을 수정 */
+               desc->status = DEVICE_OWN;
+
+               /* MMIO 를 통해 디바이스에 공지를 하기 전에 메모리를 동기화 */
+               wmb();
+
+               /* 업데이트된 디스크립터의 디바이스에 공지 */
+               writel(DESC_NOTIFY, doorbell);
+       }
+
+     dma_rmb() 는 디스크립터로부터 데이터를 읽어오기 전에 디바이스가 소유권을
+     내놓았음을 보장하게 하고, dma_wmb() 는 디바이스가 자신이 소유권을 다시
+     가졌음을 보기 전에 디스크립터에 데이터가 쓰였음을 보장합니다.  wmb() 는
+     캐시 일관성이 없는 (cache incoherent) MMIO 영역에 쓰기를 시도하기 전에
+     캐시 일관성이 있는 메모리 (cache coherent memory) 쓰기가 완료되었음을
+     보장해주기 위해 필요합니다.
+
+     consistent memory 에 대한 자세한 내용을 위해선 Documentation/DMA-API.txt
+     문서를 참고하세요.
+
+
+MMIO 쓰기 배리어
+----------------
+
+리눅스 커널은 또한 memory-mapped I/O 쓰기를 위한 특별한 배리어도 가지고
+있습니다:
+
+       mmiowb();
+
+이것은 mandatory 쓰기 배리어의 변종으로, 완화된 순서 규칙의 I/O 영역으로의
+쓰기가 부분적으로 순서를 맞추도록 해줍니다.  이 함수의 효과는 CPU->하드웨어
+인터페이스를 넘어 실제 하드웨어에까지 일부 수준으로 미칠 수 있습니다.
+
+더 많은 정보를 위해선 "Acquire vs I/O 액세스" 서브섹션을 참고하세요.
+
+
+=========================
+암묵적 커널 메모리 배리어
+=========================
+
+리눅스 커널의 일부 함수들은 메모리 배리어를 내장하고 있는데, 락(lock)과
+스케쥴링 관련 함수들이 대부분입니다.
+
+여기선 _최소한의_ 보장을 설명합니다; 특정 아키텍쳐에서는 이 설명보다 더 많은
+보장을 제공할 수도 있습니다만 해당 아키텍쳐에 종속적인 코드 외의 부분에서는
+그런 보장을 기대해선 안될겁니다.
+
+
+락 ACQUISITION 함수
+-------------------
+
+리눅스 커널은 다양한 락 구성체를 가지고 있습니다:
+
+ (*) 스핀 락
+ (*) R/W 스핀 락
+ (*) 뮤텍스
+ (*) 세마포어
+ (*) R/W 세마포어
+
+각 구성체마다 모든 경우에 "ACQUIRE" 오퍼레이션과 "RELEASE" 오퍼레이션의 변종이
+존재합니다.  이 오퍼레이션들은 모두 적절한 배리어를 내포하고 있습니다:
+
+ (1) ACQUIRE 오퍼레이션의 영향:
+
+     ACQUIRE 뒤에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된
+     뒤에 완료됩니다.
+
+     ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에
+     완료될 수 있습니다.  smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는
+     코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서
+     맞춥니다.  이건 smp_mb() 보다 완화된 것임을 기억하세요!  많은 아키텍쳐에서
+     smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다.
+
+ (2) RELEASE 오퍼레이션의 영향:
+
+     RELEASE 앞에서 요청된 메모리 오퍼레이션은 RELEASE 오퍼레이션이 완료되기
+     전에 완료됩니다.
+
+     RELEASE 뒤에서 요청된 메모리 오퍼레이션은 RELEASE 오퍼레이션 완료 전에
+     완료될 수 있습니다.
+
+ (3) ACQUIRE vs ACQUIRE 영향:
+
+     어떤 ACQUIRE 오퍼레이션보다 앞에서 요청된 모든 ACQUIRE 오퍼레이션은 그
+     ACQUIRE 오퍼레이션 전에 완료됩니다.
+
+ (4) ACQUIRE vs RELEASE implication:
+
+     어떤 RELEASE 오퍼레이션보다 앞서 요청된 ACQUIRE 오퍼레이션은 그 RELEASE
+     오퍼레이션보다 먼저 완료됩니다.
+
+ (5) 실패한 조건적 ACQUIRE 영향:
+
+     ACQUIRE 오퍼레이션의 일부 락(lock) 변종은 락이 곧바로 획득하기에는
+     불가능한 상태이거나 락이 획득 가능해지도록 기다리는 도중 시그널을 받거나
+     해서 실패할 수 있습니다.  실패한 락은 어떤 배리어도 내포하지 않습니다.
+
+[!] 참고: 락 ACQUIRE 와 RELEASE 가 단방향 배리어여서 나타나는 현상 중 하나는
+크리티컬 섹션 바깥의 인스트럭션의 영향이 크리티컬 섹션 내부로도 들어올 수
+있다는 것입니다.
+
+RELEASE 후에 요청되는 ACQUIRE 는 전체 메모리 배리어라 여겨지면 안되는데,
+ACQUIRE 앞의 액세스가 ACQUIRE 후에 수행될 수 있고, RELEASE 후의 액세스가
+RELEASE 전에 수행될 수도 있으며, 그 두개의 액세스가 서로를 지나칠 수도 있기
+때문입니다:
+
+       *A = a;
+       ACQUIRE M
+       RELEASE M
+       *B = b;
+
+는 다음과 같이 될 수도 있습니다:
+
+       ACQUIRE M, STORE *B, STORE *A, RELEASE M
+
+ACQUIRE 와 RELEASE 가 락 획득과 해제라면, 그리고 락의 ACQUIRE 와 RELEASE 가
+같은 락 변수에 대한 것이라면, 해당 락을 쥐고 있지 않은 다른 CPU 의 시야에는
+이와 같은 재배치가 일어나는 것으로 보일 수 있습니다.  요약하자면, ACQUIRE 에
+이어 RELEASE 오퍼레이션을 순차적으로 실행하는 행위가 전체 메모리 배리어로
+생각되어선 -안됩니다-.
+
+비슷하게, 앞의 반대 케이스인 RELEASE 와 ACQUIRE 두개 오퍼레이션의 순차적 실행
+역시 전체 메모리 배리어를 내포하지 않습니다.  따라서, RELEASE, ACQUIRE 로
+규정되는 크리티컬 섹션의 CPU 수행은 RELEASE 와 ACQUIRE 를 가로지를 수 있으므로,
+다음과 같은 코드는:
+
+       *A = a;
+       RELEASE M
+       ACQUIRE N
+       *B = b;
+
+다음과 같이 수행될 수 있습니다:
+
+       ACQUIRE N, STORE *B, STORE *A, RELEASE M
+
+이런 재배치는 데드락을 일으킬 수도 있을 것처럼 보일 수 있습니다.  하지만, 그런
+데드락의 조짐이 있다면 RELEASE 는 단순히 완료될 것이므로 데드락은 존재할 수
+없습니다.
+
+       이게 어떻게 올바른 동작을 할 수 있을까요?
+
+       우리가 이야기 하고 있는건 재배치를 하는 CPU 에 대한 이야기이지,
+       컴파일러에 대한 것이 아니란 점이 핵심입니다.  컴파일러 (또는, 개발자)
+       가 오퍼레이션들을 이렇게 재배치하면, 데드락이 일어날 수 -있습-니다.
+
+       하지만 CPU 가 오퍼레이션들을 재배치 했다는걸 생각해 보세요.  이 예에서,
+       어셈블리 코드 상으로는 언락이 락을 앞서게 되어 있습니다.  CPU 가 이를
+       재배치해서 뒤의 락 오퍼레이션을 먼저 실행하게 됩니다.  만약 데드락이
+       존재한다면, 이 락 오퍼레이션은 그저 스핀을 하며 계속해서 락을
+       시도합니다 (또는, 한참 후에겠지만, 잠듭니다).  CPU 는 언젠가는
+       (어셈블리 코드에서는 락을 앞서는) 언락 오퍼레이션을 실행하는데, 이 언락
+       오퍼레이션이 잠재적 데드락을 해결하고, 락 오퍼레이션도 뒤이어 성공하게
+       됩니다.
+
+       하지만 만약 락이 잠을 자는 타입이었다면요?  그런 경우에 코드는
+       스케쥴러로 들어가려 할 거고, 여기서 결국은 메모리 배리어를 만나게
+       되는데, 이 메모리 배리어는 앞의 언락 오퍼레이션이 완료되도록 만들고,
+       데드락은 이번에도 해결됩니다.  잠을 자는 행위와 언락 사이의 경주 상황
+       (race) 도 있을 수 있겠습니다만, 락 관련 기능들은 그런 경주 상황을 모든
+       경우에 제대로 해결할 수 있어야 합니다.
+
+락과 세마포어는 UP 컴파일된 시스템에서의 순서에 대해 보장을 하지 않기 때문에,
+그런 상황에서 인터럽트 비활성화 오퍼레이션과 함께가 아니라면 어떤 일에도 - 특히
+I/O 액세스와 관련해서는 - 제대로 사용될 수 없을 겁니다.
+
+"CPU 간 ACQUIRING 배리어 효과" 섹션도 참고하시기 바랍니다.
+
+
+예를 들어, 다음과 같은 코드를 생각해 봅시다:
+
+       *A = a;
+       *B = b;
+       ACQUIRE
+       *C = c;
+       *D = d;
+       RELEASE
+       *E = e;
+       *F = f;
+
+여기선 다음의 이벤트 시퀀스가 생길 수 있습니다:
+
+       ACQUIRE, {*F,*A}, *E, {*C,*D}, *B, RELEASE
+
+       [+] {*F,*A} 는 조합된 액세스를 의미합니다.
+
+하지만 다음과 같은 건 불가능하죠:
+
+       {*F,*A}, *B,    ACQUIRE, *C, *D,        RELEASE, *E
+       *A, *B, *C,     ACQUIRE, *D,            RELEASE, *E, *F
+       *A, *B,         ACQUIRE, *C,            RELEASE, *D, *E, *F
+       *B,             ACQUIRE, *C, *D,        RELEASE, {*F,*A}, *E
+
+
+
+인터럽트 비활성화 함수
+----------------------
+
+인터럽트를 비활성화 하는 함수 (ACQUIRE 와 동일) 와 인터럽트를 활성화 하는 함수
+(RELEASE 와 동일) 는 컴파일러 배리어처럼만 동작합니다.  따라서, 별도의 메모리
+배리어나 I/O 배리어가 필요한 상황이라면 그 배리어들은 인터럽트 비활성화 함수
+외의 방법으로 제공되어야만 합니다.
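+
+예를 들어, 인터럽트를 막은 구간 안에서 두 공유 변수의 쓰기 순서를 다른 CPU
+에게도 보장하고 싶다면 다음처럼 배리어를 직접 사용해야 합니다 (shared_data 와
+shared_flag 는 설명을 위해 가정한 변수들입니다):
+
+       unsigned long flags;
+
+       local_irq_save(flags);
+       WRITE_ONCE(shared_data, val);
+       smp_wmb();      /* 인터럽트 비활성화 자체는 이 순서를 보장하지 않음 */
+       WRITE_ONCE(shared_flag, 1);
+       local_irq_restore(flags);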
+
+
+슬립과 웨이크업 함수
+--------------------
+
+글로벌 데이터에 표시된 이벤트에 의해 프로세스를 잠에 빠트리는 것과 깨우는 것은
+해당 이벤트를 기다리는 태스크의 태스크 상태와 그 이벤트를 알리기 위해 사용되는
+글로벌 데이터, 두 데이터간의 상호작용으로 볼 수 있습니다.  이것이 옳은 순서대로
+일어남을 분명히 하기 위해, 프로세스를 잠에 들게 하는 기능과 깨우는 기능은
+몇가지 배리어를 내포합니다.
+
+먼저, 잠을 재우는 쪽은 일반적으로 다음과 같은 이벤트 시퀀스를 따릅니다:
+
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (event_indicated)
+                       break;
+               schedule();
+       }
+
+set_current_state() 에 의해, 태스크 상태가 바뀐 후 범용 메모리 배리어가
+자동으로 삽입됩니다:
+
+       CPU 1
+       ===============================
+       set_current_state();
+         smp_store_mb();
+           STORE current->state
+           <범용 배리어>
+       LOAD event_indicated
+
+set_current_state() 는 다음의 것들로 감싸질 수도 있습니다:
+
+       prepare_to_wait();
+       prepare_to_wait_exclusive();
+
+이것들 역시 상태를 설정한 후 범용 메모리 배리어를 삽입합니다.
+앞의 전체 시퀀스는 다음과 같은 함수들로 한번에 수행 가능한데, 이것들은 모두
+올바른 장소에 메모리 배리어를 삽입합니다:
+
+       wait_event();
+       wait_event_interruptible();
+       wait_event_interruptible_exclusive();
+       wait_event_interruptible_timeout();
+       wait_event_killable();
+       wait_event_timeout();
+       wait_on_bit();
+       wait_on_bit_lock();
+
+
+두번째로, 깨우기를 수행하는 코드는 일반적으로 다음과 같을 겁니다:
+
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+또는:
+
+       event_indicated = 1;
+       wake_up_process(event_daemon);
+
+wake_up() 류에 의해 쓰기 메모리 배리어가 내포됩니다.  만약 그것들이 뭔가를
+깨운다면요.  이 배리어는 태스크 상태가 지워지기 전에 수행되므로, 이벤트를
+알리기 위한 STORE 와 태스크 상태를 TASK_RUNNING 으로 설정하는 STORE 사이에
+위치하게 됩니다.
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       set_current_state();            STORE event_indicated
+         smp_store_mb();               wake_up();
+           STORE current->state          <쓰기 배리어>
+           <범용 배리어>            STORE current->state
+       LOAD event_indicated
+
+한번더 말합니다만, 이 쓰기 메모리 배리어는 이 코드가 정말로 뭔가를 깨울 때에만
+실행됩니다.  이걸 설명하기 위해, X 와 Y 는 모두 0 으로 초기화 되어 있다는 가정
+하에 아래의 이벤트 시퀀스를 생각해 봅시다:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       X = 1;                          STORE event_indicated
+       smp_mb();                       wake_up();
+       Y = 1;                          wait_event(wq, Y == 1);
+       wake_up();                        load from Y sees 1, no memory barrier
+                                       load from X might see 0
+
+위 예제에서의 경우와 달리 깨우기가 정말로 행해졌다면, CPU 2 의 X 로드는 1 을
+본다고 보장될 수 있을 겁니다.
+
+사용 가능한 깨우기류 함수들로 다음과 같은 것들이 있습니다:
+
+       complete();
+       wake_up();
+       wake_up_all();
+       wake_up_bit();
+       wake_up_interruptible();
+       wake_up_interruptible_all();
+       wake_up_interruptible_nr();
+       wake_up_interruptible_poll();
+       wake_up_interruptible_sync();
+       wake_up_interruptible_sync_poll();
+       wake_up_locked();
+       wake_up_locked_poll();
+       wake_up_nr();
+       wake_up_poll();
+       wake_up_process();
+
+
+[!] 잠재우는 코드와 깨우는 코드에 내포되는 메모리 배리어들은 깨우기 전에
+이루어진 스토어를 잠재우는 코드가 set_current_state() 를 호출한 후에 행하는
+로드에 대해 순서를 맞추지 _않는다는_ 점을 기억하세요.  예를 들어, 잠재우는
+코드가 다음과 같고:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated)
+               break;
+       __set_current_state(TASK_RUNNING);
+       do_something(my_data);
+
+깨우는 코드는 다음과 같다면:
+
+       my_data = value;
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+event_indicated 에의 변경이 잠재우는 코드에게 my_data 에의 변경 후에 이루어진
+것으로 인지될 것이라는 보장이 없습니다.  이런 경우에는 양쪽 코드 모두 각각의
+데이터 액세스 사이에 메모리 배리어를 직접 쳐야 합니다.  따라서 앞의 재우는
+코드는 다음과 같이:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated) {
+               smp_rmb();
+               do_something(my_data);
+       }
+
+그리고 깨우는 코드는 다음과 같이 되어야 합니다:
+
+       my_data = value;
+       smp_wmb();
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+
+그외의 함수들
+-------------
+
+그외의 배리어를 내포하는 함수들은 다음과 같습니다:
+
+ (*) schedule() 과 그 유사한 것들이 완전한 메모리 배리어를 내포합니다.
+
+
+==============================
+CPU 간 ACQUIRING 배리어의 효과
+==============================
+
+SMP 시스템에서의 락 기능들은 더욱 강력한 형태의 배리어를 제공합니다: 이
+배리어는 동일한 락을 사용하는 다른 CPU 들의 메모리 액세스 순서에도 영향을
+끼칩니다.
+
+
+ACQUIRE VS 메모리 액세스
+------------------------
+
+다음의 예를 생각해 봅시다: 시스템은 두개의 스핀락 (M) 과 (Q), 그리고 세개의 CPU
+를 가지고 있습니다; 여기에 다음의 이벤트 시퀀스가 발생합니다:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       WRITE_ONCE(*A, a);              WRITE_ONCE(*E, e);
+       ACQUIRE M                       ACQUIRE Q
+       WRITE_ONCE(*B, b);              WRITE_ONCE(*F, f);
+       WRITE_ONCE(*C, c);              WRITE_ONCE(*G, g);
+       RELEASE M                       RELEASE Q
+       WRITE_ONCE(*D, d);              WRITE_ONCE(*H, h);
+
+*A 로의 액세스부터 *H 로의 액세스까지가 어떤 순서로 CPU 3 에게 보여질지에
+대해서는 각 CPU 에서의 락 사용에 의해 내포되어 있는 제약을 제외하고는 어떤
+보장도 존재하지 않습니다.  예를 들어, CPU 3 에게 다음과 같은 순서로 보여지는
+것이 가능합니다:
+
+       *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M
+
+하지만 다음과 같이 보이지는 않을 겁니다:
+
+       *B, *C or *D preceding ACQUIRE M
+       *A, *B or *C following RELEASE M
+       *F, *G or *H preceding ACQUIRE Q
+       *E, *F or *G following RELEASE Q
+
+
+
+ACQUIRE VS I/O 액세스
+----------------------
+
+특정한 (특히 NUMA 가 관련된) 환경 하에서 두개의 CPU 에서 동일한 스핀락으로
+보호되는 두개의 크리티컬 섹션 안의 I/O 액세스는 PCI 브릿지에 겹쳐진 I/O
+액세스로 보일 수 있는데, PCI 브릿지는 캐시 일관성 프로토콜과 합을 맞춰야 할
+의무가 없으므로, 필요한 읽기 메모리 배리어가 요청되지 않기 때문입니다.
+
+예를 들어서:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       spin_lock(Q)
+       writel(0, ADDR)
+       writel(1, DATA);
+       spin_unlock(Q);
+                                       spin_lock(Q);
+                                       writel(4, ADDR);
+                                       writel(5, DATA);
+                                       spin_unlock(Q);
+
+는 PCI 브릿지에 다음과 같이 보일 수 있습니다:
+
+       STORE *ADDR = 0, STORE *ADDR = 4, STORE *DATA = 1, STORE *DATA = 5
+
+이렇게 되면 하드웨어의 오동작을 일으킬 수 있습니다.
+
+
+이런 경우엔 잡아둔 스핀락을 내려놓기 전에 mmiowb() 를 수행해야 하는데, 예를
+들면 다음과 같습니다:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       spin_lock(Q)
+       writel(0, ADDR)
+       writel(1, DATA);
+       mmiowb();
+       spin_unlock(Q);
+                                       spin_lock(Q);
+                                       writel(4, ADDR);
+                                       writel(5, DATA);
+                                       mmiowb();
+                                       spin_unlock(Q);
+
+이 코드는 CPU 1 에서 요청된 두개의 스토어가 PCI 브릿지에 CPU 2 에서 요청된
+스토어들보다 먼저 보여짐을 보장합니다.
+
+
+또한, 같은 디바이스에 대해 스토어에 이어 로드가 수행되면, 이 로드는 자신이
+수행되기 전에 스토어가 완료되기를 강제하므로 mmiowb() 가 필요 없어집니다:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       spin_lock(Q)
+       writel(0, ADDR)
+       a = readl(DATA);
+       spin_unlock(Q);
+                                       spin_lock(Q);
+                                       writel(4, ADDR);
+                                       b = readl(DATA);
+                                       spin_unlock(Q);
+
+
+더 많은 정보를 위해선 Documentation/DocBook/deviceiobook.tmpl 을 참고하세요.
+
+
+=========================
+메모리 배리어가 필요한 곳
+=========================
+
+설령 SMP 커널을 사용하더라도 싱글 쓰레드로 동작하는 코드는 올바르게 동작하는
+것으로 보여질 것이기 때문에, 평범한 시스템 운영중에 메모리 오퍼레이션 재배치는
+일반적으로 문제가 되지 않습니다.  하지만, 재배치가 문제가 _될 수 있는_ 네가지
+환경이 있습니다:
+
+ (*) 프로세서간 상호 작용.
+
+ (*) 어토믹 오퍼레이션.
+
+ (*) 디바이스 액세스.
+
+ (*) 인터럽트.
+
+
+프로세서간 상호 작용
+--------------------
+
+두개 이상의 프로세서를 가진 시스템이 있다면, 시스템의 두개 이상의 CPU 는 동시에
+같은 데이터에 대한 작업을 할 수 있습니다.  이는 동기화 문제를 일으킬 수 있고,
+이 문제를 해결하는 일반적 방법은 락을 사용하는 것입니다.  하지만, 락은 상당히
+비용이 비싸서 가능하면 락을 사용하지 않고 일을 처리하는 것이 낫습니다.  이런
+경우, 두 CPU 모두에 영향을 끼치는 오퍼레이션들은 오동작을 막기 위해 신중하게
+순서가 맞춰져야 합니다.
+
+예를 들어, R/W 세마포어의 느린 수행경로 (slow path) 를 생각해 봅시다.
+세마포어를 위해 대기를 하는 하나의 프로세스가 자신의 스택 중 일부를 이
+세마포어의 대기 프로세스 리스트에 링크한 채로 있습니다:
+
+       struct rw_semaphore {
+               ...
+               spinlock_t lock;
+               struct list_head waiters;
+       };
+
+       struct rwsem_waiter {
+               struct list_head list;
+               struct task_struct *task;
+       };
+
+특정 대기 상태 프로세스를 깨우기 위해, up_read() 나 up_write() 함수는 다음과
+같은 일을 합니다:
+
+ (1) 다음 대기 상태 프로세스 레코드는 어디있는지 알기 위해 이 대기 상태
+     프로세스 레코드의 next 포인터를 읽습니다;
+
+ (2) 이 대기 상태 프로세스의 task 구조체로의 포인터를 읽습니다;
+
+ (3) 이 대기 상태 프로세스가 세마포어를 획득했음을 알리기 위해 task
+     포인터를 초기화 합니다;
+
+ (4) 해당 태스크에 대해 wake_up_process() 를 호출합니다; 그리고
+
+ (5) 해당 대기 상태 프로세스의 task 구조체를 잡고 있던 레퍼런스를 해제합니다.
+
+달리 말하자면, 다음 이벤트 시퀀스를 수행해야 합니다:
+
+       LOAD waiter->list.next;
+       LOAD waiter->task;
+       STORE waiter->task;
+       CALL wakeup
+       RELEASE task
+
+그리고 이 이벤트들이 다른 순서로 수행된다면, 오동작이 일어날 수 있습니다.
+
+한번 세마포어의 대기줄에 들어갔고 세마포어 락을 놓았다면, 해당 대기 프로세스는
+락을 다시는 잡지 않습니다; 대신 자신의 task 포인터가 초기화 되길 기다립니다.
+그 레코드는 대기 프로세스의 스택에 있기 때문에, 리스트의 next 포인터가 읽혀지기
+_전에_ task 포인터가 지워진다면, 다른 CPU 는 해당 대기 프로세스를 시작해 버리고
+up*() 함수가 next 포인터를 읽기 전에 대기 프로세스의 스택을 마구 건드릴 수
+있습니다.
+
+그렇게 되면 위의 이벤트 시퀀스에 어떤 일이 일어나는지 생각해 보죠:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+                                       down_xxx()
+                                       Queue waiter
+                                       Sleep
+       up_yyy()
+       LOAD waiter->task;
+       STORE waiter->task;
+                                       Woken up by other event
+       <preempt>
+                                       Resume processing
+                                       down_xxx() returns
+                                       call foo()
+                                       foo() clobbers *waiter
+       </preempt>
+       LOAD waiter->list.next;
+       --- OOPS ---
+
+이 문제는 세마포어 락의 사용으로 해결될 수도 있겠지만, 그렇게 되면 깨어난 후에
+down_xxx() 함수가 불필요하게 스핀락을 또다시 얻어야만 합니다.
+
+이 문제를 해결하는 방법은 범용 SMP 메모리 배리어를 추가하는 겁니다:
+
+       LOAD waiter->list.next;
+       LOAD waiter->task;
+       smp_mb();
+       STORE waiter->task;
+       CALL wakeup
+       RELEASE task
+
+이 경우에, 배리어는 시스템의 나머지 CPU 들에게 모든 배리어 앞의 메모리 액세스가
+배리어 뒤의 메모리 액세스보다 앞서 일어난 것으로 보이게 만듭니다.  배리어 앞의
+메모리 액세스들이 배리어 명령 자체가 완료되는 시점까지 완료된다고는 보장하지
+_않습니다_.
+
+(이게 문제가 되지 않을) 단일 프로세서 시스템에서 smp_mb() 는 실제로는 그저
+컴파일러가 CPU 안에서의 순서를 바꾸거나 하지 않고 주어진 순서대로 명령을
+내리도록 하는 컴파일러 배리어일 뿐입니다.  오직 하나의 CPU 만 있으니, CPU 의
+의존성 순서 로직이 그 외의 모든것을 알아서 처리할 겁니다.
+
+
+어토믹 오퍼레이션
+-----------------
+
+어토믹 오퍼레이션은 기술적으로 프로세서간 상호작용으로 분류되며 그 중 일부는
+전체 메모리 배리어를 내포하고 또 일부는 내포하지 않지만, 커널에서 상당히
+의존적으로 사용하는 기능 중 하나입니다.
+
+메모리의 어떤 상태를 수정하고 해당 상태에 대한 (예전의 또는 최신의) 정보를
+리턴하는 어토믹 오퍼레이션은 모두 SMP-조건적 범용 메모리 배리어(smp_mb())를
+실제 오퍼레이션의 앞과 뒤에 내포합니다.  이런 오퍼레이션은 다음의 것들을
+포함합니다:
+
+       xchg();
+       atomic_xchg();                  atomic_long_xchg();
+       atomic_inc_return();            atomic_long_inc_return();
+       atomic_dec_return();            atomic_long_dec_return();
+       atomic_add_return();            atomic_long_add_return();
+       atomic_sub_return();            atomic_long_sub_return();
+       atomic_inc_and_test();          atomic_long_inc_and_test();
+       atomic_dec_and_test();          atomic_long_dec_and_test();
+       atomic_sub_and_test();          atomic_long_sub_and_test();
+       atomic_add_negative();          atomic_long_add_negative();
+       test_and_set_bit();
+       test_and_clear_bit();
+       test_and_change_bit();
+
+       /* exchange 조건이 성공할 때 */
+       cmpxchg();
+       atomic_cmpxchg();               atomic_long_cmpxchg();
+       atomic_add_unless();            atomic_long_add_unless();
+
+이것들은 메모리 배리어 효과가 필요한 ACQUIRE 부류와 RELEASE 부류 오퍼레이션들을
+구현할 때, 그리고 객체 해제를 위해 레퍼런스 카운터를 조정할 때, 암묵적 메모리
+배리어 효과가 필요한 곳 등에 사용됩니다.
+
+
+다음의 오퍼레이션들은 메모리 배리어를 내포하지 _않기_ 때문에 문제가 될 수
+있지만, RELEASE 부류의 오퍼레이션들과 같은 것들을 구현할 때 사용될 수도
+있습니다:
+
+       atomic_set();
+       set_bit();
+       clear_bit();
+       change_bit();
+
+이것들을 사용할 때에는 필요하다면 적절한 (예를 들면 smp_mb__before_atomic()
+같은) 메모리 배리어가 명시적으로 함께 사용되어야 합니다.
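+
+예를 들어, 결과값을 써넣은 뒤 '사용 중' 비트를 내리는 RELEASE 류의 동작은
+다음처럼 만들어볼 수 있을 겁니다 (obj->result 와 OBJ_IN_USE 비트는 설명을
+위해 가정한 것들입니다):
+
+       obj->result = value;
+       smp_mb__before_atomic();   /* 위의 스토어가 비트 해제보다 먼저 보이게 함 */
+       clear_bit(OBJ_IN_USE, &obj->flags);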
+
+
+아래의 것들도 메모리 배리어를 내포하지 _않기_ 때문에, 일부 환경에서는 (예를
+들면 smp_mb__before_atomic() 과 같은) 명시적인 메모리 배리어 사용이 필요합니다.
+
+       atomic_add();
+       atomic_sub();
+       atomic_inc();
+       atomic_dec();
+
+이것들이 통계 생성을 위해 사용된다면, 그리고 통계 데이터 사이에 관계가 존재하지
+않는다면 메모리 배리어는 필요치 않을 겁니다.
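+
+예를 들어, 서로 연관이 없는 통계 카운터라면 다음처럼 배리어 없이 그냥
+증가시키기만 하면 됩니다 (stats 구조체와 그 필드들은 설명을 위해 가정한
+것들입니다):
+
+       atomic_inc(&stats->rx_packets);
+       atomic_add(len, &stats->rx_bytes);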
+
+객체의 수명을 관리하기 위해 레퍼런스 카운팅 목적으로 사용된다면, 레퍼런스
+카운터는 락으로 보호되는 섹션에서만 조정되거나 호출하는 쪽이 이미 충분한
+레퍼런스를 잡고 있을 것이기 때문에 메모리 배리어는 아마 필요 없을 겁니다.
+
+만약 어떤 락을 구성하기 위해 사용된다면, 락 관련 동작은 일반적으로 작업을 특정
+순서대로 진행해야 하므로 메모리 배리어가 필요할 수 있습니다.
+
+기본적으로, 각 사용처에서는 메모리 배리어가 필요한지 아닌지 충분히 고려해야
+합니다.
+
+아래의 오퍼레이션들은 특별한 락 관련 동작들입니다:
+
+       test_and_set_bit_lock();
+       clear_bit_unlock();
+       __clear_bit_unlock();
+
+이것들은 ACQUIRE 류와 RELEASE 류의 오퍼레이션들을 구현합니다.  락 관련 도구를
+구현할 때에는 이것들을 좀 더 선호하는 편이 나은데, 이것들의 구현은 많은
+아키텍쳐에서 최적화 될 수 있기 때문입니다.
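+
+예를 들어, 이것들을 사용해 다음과 같은 간단한 비트 락을 만들어볼 수 있습니다
+(bitlock 은 설명을 위해 가정한 unsigned long 변수입니다):
+
+       while (test_and_set_bit_lock(0, &bitlock))
+               cpu_relax();                    /* ACQUIRE 의미로 락 획득 */
+
+       /* 크리티컬 섹션 */
+
+       clear_bit_unlock(0, &bitlock);          /* RELEASE 의미로 락 해제 */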
+
+[!] 이런 상황에 사용할 수 있는 특수한 메모리 배리어 도구들이 있습니다만, 일부
+CPU 에서는 사용되는 어토믹 인스트럭션 자체에 메모리 배리어가 내포되어 있어서
+어토믹 오퍼레이션과 메모리 배리어를 함께 사용하는 게 불필요한 일이 될 수
+있는데, 그런 경우에 이 특수 메모리 배리어 도구들은 no-op 이 되어 실질적으로
+아무일도 하지 않습니다.
+
+더 많은 내용을 위해선 Documentation/atomic_ops.txt 를 참고하세요.
+
+
+디바이스 액세스
+---------------
+
+많은 디바이스가 메모리 매핑 기법으로 제어될 수 있는데, 그렇게 제어되는
+디바이스는 CPU 에는 단지 특정 메모리 영역의 집합처럼 보이게 됩니다.  드라이버는
+그런 디바이스를 제어하기 위해 정확히 올바른 순서로 올바른 메모리 액세스를
+만들어야 합니다.
+
+하지만, 액세스들을 재배치 하거나 조합하거나 병합하는게 더 효율적이라 판단하는
+영리한 CPU 나 컴파일러들을 사용하면 드라이버 코드의 조심스럽게 순서 맞춰진
+액세스들이 디바이스에는 요청된 순서대로 도착하지 못하게 할 수 있는 - 디바이스가
+오동작을 하게 할 - 잠재적 문제가 생길 수 있습니다.
+
+리눅스 커널 내부에서, I/O 는 어떻게 액세스들을 적절히 순차적이게 만들 수 있는지
+알고 있는, - inb() 나 writel() 과 같은 - 적절한 액세스 루틴을 통해 이루어져야만
+합니다.  이것들은 대부분의 경우에는 명시적 메모리 배리어 와 함께 사용될 필요가
+없습니다만, 다음의 두가지 상황에서는 명시적 메모리 배리어가 필요할 수 있습니다:
+
+ (1) 일부 시스템에서 I/O 스토어는 모든 CPU 에 일관되게 순서 맞춰지지 않는데,
+     따라서 _모든_ 일반적인 드라이버들에 락이 사용되어야만 하고 이 크리티컬
+     섹션을 빠져나오기 전에 mmiowb() 가 꼭 호출되어야 합니다.
+
+ (2) 만약 액세스 함수들이 완화된 메모리 액세스 속성을 갖는 I/O 메모리 윈도우를
+     사용한다면, 순서를 강제하기 위해선 _mandatory_ 메모리 배리어가 필요합니다.
+
+더 많은 정보를 위해선 Documentation/DocBook/deviceiobook.tmpl 을 참고하십시오.
+
+
+인터럽트
+--------
+
+드라이버는 자신의 인터럽트 서비스 루틴에 의해 인터럽트 당할 수 있기 때문에
+드라이버의 이 두 부분은 서로의 디바이스 제어 또는 액세스 부분과 상호 간섭할 수
+있습니다.
+
+스스로에게 인터럽트 당하는 걸 불가능하게 하고, 드라이버의 크리티컬한
+오퍼레이션들을 모두 인터럽트가 불가능하게 된 영역에 집어넣거나 하는 방법 (락의
+한 형태) 으로 이런 상호 간섭을 - 최소한 부분적으로라도 - 줄일 수 있습니다.
+드라이버의 인터럽트 루틴이 실행 중인 동안, 해당 드라이버의 코어는 같은 CPU 에서
+수행되지 않을 것이며, 현재의 인터럽트가 처리되는 중에는 또다시 인터럽트가
+일어나지 못하도록 되어 있으니 인터럽트 핸들러는 그에 대해서는 락을 잡지 않아도
+됩니다.
+
+하지만, 어드레스 레지스터와 데이터 레지스터를 갖는 이더넷 카드를 다루는
+드라이버를 생각해 봅시다.  만약 이 드라이버의 코어가 인터럽트를 비활성화시킨
+채로 이더넷 카드와 대화하고 드라이버의 인터럽트 핸들러가 호출되었다면:
+
+       LOCAL IRQ DISABLE
+       writew(ADDR, 3);
+       writew(DATA, y);
+       LOCAL IRQ ENABLE
+       <interrupt>
+       writew(ADDR, 4);
+       q = readw(DATA);
+       </interrupt>
+
+만약 순서 규칙이 충분히 완화되어 있다면 데이터 레지스터에의 스토어는 어드레스
+레지스터에 두번째로 행해지는 스토어 뒤에 일어날 수도 있습니다:
+
+       STORE *ADDR = 3, STORE *ADDR = 4, STORE *DATA = y, q = LOAD *DATA
+
+
+만약 순서 규칙이 충분히 완화되어 있고 묵시적으로든 명시적으로든 배리어가
+사용되지 않았다면 인터럽트 비활성화 섹션에서 일어난 액세스가 바깥으로 새어서
+인터럽트 내에서 일어난 액세스와 섞일 수 있다고 - 그리고 그 반대도 - 가정해야만
+합니다.
+
+그런 영역 안에서 일어나는 I/O 액세스들은 엄격한 순서 규칙의 I/O 레지스터에
+묵시적 I/O 배리어를 형성하는 동기적 (synchronous) 로드 오퍼레이션을 포함하기
+때문에 일반적으로는 이런게 문제가 되지 않습니다.  만약 이걸로는 충분치 않다면
+mmiowb() 가 명시적으로 사용될 필요가 있습니다.
+
+
+하나의 인터럽트 루틴과 별도의 CPU 에서 수행중이며 서로 통신을 하는 두 루틴
+사이에도 비슷한 상황이 일어날 수 있습니다.  만약 그런 경우가 발생할 가능성이
+있다면, 순서를 보장하기 위해 인터럽트 비활성화 락이 사용되어져야만 합니다.
+
+
+======================
+커널 I/O 배리어의 효과
+======================
+
+I/O 메모리에 액세스할 때, 드라이버는 적절한 액세스 함수를 사용해야 합니다:
+
+ (*) inX(), outX():
+
+     이것들은 메모리 공간보다는 I/O 공간에 이야기를 하려는 의도로
+     만들어졌습니다만, 그건 기본적으로 CPU 마다 다른 컨셉입니다.  i386 과
+     x86_64 프로세서들은 특별한 I/O 공간 액세스 사이클과 명령어를 실제로 가지고
+     있지만, 다른 많은 CPU 들에는 그런 컨셉이 존재하지 않습니다.
+
+     다른 것들 중에서도 PCI 버스가 I/O 공간 컨셉을 정의하는데, 이는 - i386 과
+     x86_64 같은 CPU 에서 - CPU 의 I/O 공간 컨셉으로 쉽게 매치됩니다.  하지만,
+     대체할 I/O 공간이 없는 CPU 에서는 CPU 의 메모리 맵의 가상 I/O 공간으로
+     매핑될 수도 있습니다.
+
+     이 공간으로의 액세스는 (i386 등에서는) 완전하게 동기화 됩니다만, 중간의
+     (PCI 호스트 브리지와 같은) 브리지들은 이를 완전히 보장하진 않을수도
+     있습니다.
+
+     이것들의 상호간의 순서는 완전하게 보장됩니다.
+
+     다른 타입의 메모리 오퍼레이션, I/O 오퍼레이션에 대한 순서는 완전하게
+     보장되지는 않습니다.
+
+ (*) readX(), writeX():
+
+     이것들이 수행 요청되는 CPU 에서 서로에게 완전히 순서가 맞춰지고 독립적으로
+     수행되는지에 대한 보장 여부는 이들이 액세스 하는 메모리 윈도우에 정의된
+     특성에 의해 결정됩니다.  예를 들어, 최신의 i386 아키텍쳐 머신에서는 MTRR
+     레지스터로 이 특성이 조정됩니다.
+
+     일반적으로는, 프리페치 (prefetch) 가능한 디바이스를 액세스 하는게
+     아니라면, 이것들은 완전히 순서가 맞춰지고 결합되지 않게 보장될 겁니다.
+
+     하지만, (PCI 브리지와 같은) 중간의 하드웨어는 자신이 원한다면 집행을
+     연기시킬 수 있습니다; 스토어 명령을 실제로 하드웨어로 내려보내기(flush)
+     위해서는 같은 위치로부터 로드를 하는 방법이 있습니다만[*], PCI 의 경우는
+     같은 디바이스나 환경 구성 영역에서의 로드만으로도 충분할 겁니다.
+
+     [*] 주의! 쓰여진 것과 같은 위치로부터의 로드를 시도하는 것은 오동작을
+        일으킬 수도 있습니다 - 예로 16550 Rx/Tx 시리얼 레지스터를 생각해
+        보세요.
+
+     프리페치 가능한 I/O 메모리가 사용되면, 스토어 명령들이 순서를 지키도록
+     하기 위해 mmiowb() 배리어가 필요할 수 있습니다.
+
+     PCI 트랜잭션 사이의 상호작용에 대해 더 많은 정보를 위해선 PCI 명세서를
+     참고하시기 바랍니다.
+
+ (*) readX_relaxed(), writeX_relaxed()
+
+     이것들은 readX() 와 writeX() 랑 비슷하지만, 더 완화된 메모리 순서 보장을
+     제공합니다.  구체적으로, 이것들은 일반적 메모리 액세스 (예: DMA 버퍼) 에도
+     LOCK 이나 UNLOCK 오퍼레이션들에도 순서를 보장하지 않습니다.  LOCK 이나
+     UNLOCK 오퍼레이션들에 맞춰지는 순서가 필요하다면, mmiowb() 배리어가 사용될
+     수 있습니다.  같은 주변 장치에의 완화된 액세스끼리는 순서가 지켜짐을 알아
+     두시기 바랍니다.
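+
+     예를 들어, 순서가 같은 주변 장치 안에서만 문제되는 경우라면 다음처럼
+     사용해볼 수 있을 겁니다 (dev->regs 와 CMD, STATUS 오프셋은 설명을 위해
+     가정한 것들입니다):
+
+       writel_relaxed(cmd, dev->regs + CMD);
+       /* 같은 주변 장치에의 relaxed 액세스이므로 위의 쓰기 뒤에 수행됨 */
+       status = readl_relaxed(dev->regs + STATUS);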
+
+ (*) ioreadX(), iowriteX()
+
+     이것들은 inX()/outX() 나 readX()/writeX() 처럼 실제로 수행하는 액세스의
+     종류에 따라 적절하게 수행될 것입니다.
+
+
+===================================
+가정되는 가장 완화된 실행 순서 모델
+===================================
+
+컨셉적으로 CPU 는 주어진 프로그램에 대해 프로그램 그 자체에는 인과성 (program
+causality) 을 지키는 것처럼 보이게 하지만 일반적으로는 순서를 거의 지켜주지
+않는다고 가정되어야만 합니다.  (i386 이나 x86_64 같은) 일부 CPU 들은 코드
+재배치에 (powerpc 나 frv 와 같은) 다른 것들에 비해 강한 제약을 갖지만, 아키텍쳐
+종속적 코드 이외의 코드에서는 순서에 대한 제약이 가장 완화된 경우 (DEC Alpha)
+를 가정해야 합니다.
+
+이 말은, CPU 에게 주어지는 인스트럭션 스트림 내의 한 인스트럭션이 앞의
+인스트럭션에 종속적이라면 앞의 인스트럭션은 뒤의 종속적 인스트럭션이 실행되기
+전에 완료[*]될 수 있어야 한다는 제약 (달리 말해서, 인과성이 지켜지는 것으로
+보이게 함) 외에는 자신이 원하는 순서대로 - 심지어 병렬적으로도 - 그 스트림을
+실행할 수 있음을 의미합니다.
+
+ [*] 일부 인스트럭션은 하나 이상의 영향 - 조건 코드를 바꾼다던지, 레지스터나
+     메모리를 바꾼다던지 - 을 만들어내며, 다른 인스트럭션은 다른 효과에
+     종속적일 수 있습니다.
+
+CPU 는 최종적으로 아무 효과도 만들지 않는 인스트럭션 시퀀스는 없애버릴 수도
+있습니다.  예를 들어, 만약 두개의 연속되는 인스트럭션이 둘 다 같은 레지스터에
+직접적인 값 (immediate value) 을 집어넣는다면, 첫번째 인스트럭션은 버려질 수도
+있습니다.
+
+
+비슷하게, 컴파일러 역시 프로그램의 인과성만 지켜준다면 인스트럭션 스트림을
+자신이 보기에 올바르다 생각되는대로 재배치 할 수 있습니다.
+
+
+===============
+CPU 캐시의 영향
+===============
+
+캐시된 메모리 오퍼레이션들이 시스템 전체에 어떻게 인지되는지는 CPU 와 메모리
+사이에 존재하는 캐시들, 그리고 시스템 상태의 일관성을 관리하는 메모리 일관성
+시스템에 상당 부분 영향을 받습니다.
+
+한 CPU 가 시스템의 다른 부분들과 캐시를 통해 상호작용한다면, 메모리 시스템은
+CPU 의 캐시들을 포함해야 하며, CPU 와 CPU 자신의 캐시 사이에서의 동작을 위한
+메모리 배리어를 가져야 합니다. (메모리 배리어는 논리적으로는 다음 그림의
+점선에서 동작합니다):
+
+           <--- CPU --->         :       <----------- Memory ----------->
+                                 :
+       +--------+    +--------+  :   +--------+    +-----------+
+       |        |    |        |  :   |        |    |           |    +--------+
+       |  CPU   |    | Memory |  :   | CPU    |    |           |    |        |
+       |  Core  |--->| Access |----->| Cache  |<-->|           |    |        |
+       |        |    | Queue  |  :   |        |    |           |--->| Memory |
+       |        |    |        |  :   |        |    |           |    |        |
+       +--------+    +--------+  :   +--------+    |           |    |        |
+                                 :                 | Cache     |    +--------+
+                                 :                 | Coherency |
+                                 :                 | Mechanism |    +--------+
+       +--------+    +--------+  :   +--------+    |           |    |        |
+       |        |    |        |  :   |        |    |           |    |        |
+       |  CPU   |    | Memory |  :   | CPU    |    |           |--->| Device |
+       |  Core  |--->| Access |----->| Cache  |<-->|           |    |        |
+       |        |    | Queue  |  :   |        |    |           |    |        |
+       |        |    |        |  :   |        |    |           |    +--------+
+       +--------+    +--------+  :   +--------+    +-----------+
+                                 :
+                                 :
+
+특정 로드나 스토어는 해당 오퍼레이션을 요청한 CPU 의 캐시 내에서 동작을 완료할
+수도 있기 때문에 해당 CPU 의 바깥에는 보이지 않을 수 있지만, 다른 CPU 가 관심을
+갖는다면 캐시 일관성 메커니즘이 해당 캐시라인을 해당 CPU 에게 전달하고, 해당
+메모리 영역에 대한 오퍼레이션이 발생할 때마다 그 영향을 전파시키기 때문에, 해당
+오퍼레이션은 메모리에 실제로 액세스를 한것처럼 나타날 것입니다.
+
+CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면 인스트럭션들을 어떤
+순서로든 재배치해서 수행할 수 있습니다.  일부 인스트럭션들은 로드나 스토어
+오퍼레이션을 만드는데 이 오퍼레이션들은 이후 수행될 메모리 액세스 큐에 들어가게
+됩니다.  코어는 이 오퍼레이션들을 해당 큐에 어떤 순서로든 원하는대로 넣을 수
+있고, 다른 인스트럭션의 완료를 기다리도록 강제되기 전까지는 수행을 계속합니다.
+
+메모리 배리어가 하는 일은 CPU 쪽에서 메모리 쪽으로 넘어가는 액세스들의 순서,
+그리고 그 액세스의 결과가 시스템의 다른 관찰자들에게 인지되는 순서를 제어하는
+것입니다.
+
+[!] CPU 들은 항상 그들 자신의 로드와 스토어는 프로그램 순서대로 일어난 것으로
+보기 때문에, 주어진 CPU 내에서는 메모리 배리어를 사용할 필요가 _없습니다_.
+
+[!] MMIO 나 다른 디바이스 액세스들은 캐시 시스템을 우회할 수도 있습니다.  우회
+여부는 디바이스가 액세스 되는 메모리 윈도우의 특성에 의해 결정될 수도 있고, CPU
+가 가지고 있을 수 있는 특수한 디바이스 통신 인스트럭션의 사용에 의해서 결정될
+수도 있습니다.
+
+
+캐시 일관성
+-----------
+
+하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로
+기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다.  한 CPU 에서
+만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른
+CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다.
+
+
+두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를,
+CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해
+봅시다:
+
+                   :
+                   :                          +--------+
+                   :      +---------+         |        |
+       +--------+  : +--->| Cache A |<------->|        |
+       |        |  : |    +---------+         |        |
+       |  CPU 1 |<---+                        |        |
+       |        |  : |    +---------+         |        |
+       +--------+  : +--->| Cache B |<------->|        |
+                   :      +---------+         |        |
+                   :                          | Memory |
+                   :      +---------+         | System |
+       +--------+  : +--->| Cache C |<------->|        |
+       |        |  : |    +---------+         |        |
+       |  CPU 2 |<---+                        |        |
+       |        |  : |    +---------+         |        |
+       +--------+  : +--->| Cache D |<------->|        |
+                   :      +---------+         |        |
+                   :                          +--------+
+                   :
+
+이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다:
+
+ (*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음;
+
+ (*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음;
+
+ (*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을
+     메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에
+     액세스 하기 위해 버스를 사용할 수 있음;
+
+ (*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에
+     적용되어야 할 오퍼레이션들의 큐를 가짐;
+
+ (*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는
+     비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다
+     할지라도 그러함.
+
+이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에
+요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기
+배리어를 사용하는 상황을 상상해 봅시다:
+
+       CPU 1           CPU 2           COMMENT
+       =============== =============== =======================================
+                                       u == 0, v == 1 and p == &u, q == &u
+       v = 2;
+       smp_wmb();                      v 의 변경이 p 의 변경 전에 보일 것을
+                                        분명히 함
+       <A:modify v=2>                  v 는 이제 캐시 A 에 독점적으로 존재함
+       p = &v;
+       <B:modify p=&v>                 p 는 이제 캐시 B 에 독점적으로 존재함
+
+여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로
+시스템의 다른 CPU 들이 인지하게 만듭니다.  하지만, 이제 두번째 CPU 가 그 값들을
+읽으려 하는 상황을 생각해 봅시다:
+
+       CPU 1           CPU 2           COMMENT
+       =============== =============== =======================================
+       ...
+                       q = p;
+                       x = *q;
+
+위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU
+의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의
+업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에
+업데이트 되어버렸을 수 있기 때문입니다.
+
+       CPU 1           CPU 2           COMMENT
+       =============== =============== =======================================
+                                       u == 0, v == 1 and p == &u, q == &u
+       v = 2;
+       smp_wmb();
+       <A:modify v=2>  <C:busy>
+                       <C:queue v=2>
+       p = &v;         q = p;
+                       <D:request p>
+       <B:modify p=&v> <D:commit p=&v>
+                       <D:read p>
+                       x = *q;
+                       <C:read *q>     캐시에 업데이트 되기 전의 v 를 읽음
+                       <C:unbusy>
+                       <C:commit v=2>
+
+기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만,
+별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할
+것이라는 보장이 없습니다.
+
+
+여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들
+사이에 넣어야 합니다.  이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성
+큐를 처리하도록 강제하게 됩니다.
+
+       CPU 1           CPU 2           COMMENT
+       =============== =============== =======================================
+                                       u == 0, v == 1 and p == &u, q == &u
+       v = 2;
+       smp_wmb();
+       <A:modify v=2>  <C:busy>
+                       <C:queue v=2>
+       p = &v;         q = p;
+                       <D:request p>
+       <B:modify p=&v> <D:commit p=&v>
+                       <D:read p>
+                       smp_read_barrier_depends()
+                       <C:unbusy>
+                       <C:commit v=2>
+                       x = *q;
+                       <C:read *q>     캐시에 업데이트 된 v 를 읽음
+
+
+이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은
+데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기
+때문입니다.  대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기
+오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건
+아니기 때문에 이점에 의존해선 안됩니다.
+
+다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리
+액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다.  Alpha 는 가장
+약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로
+사용되지 않았을 때에는 그런 조정이 필요하지 않게 했습니다.
+
+
+캐시 일관성 VS DMA
+------------------
+
+모든 시스템이 DMA 를 하는 디바이스에 대해서까지 캐시 일관성을 유지하지는
+않습니다.  그런 경우, DMA 를 시도하는 디바이스는 RAM 으로부터 잘못된 데이터를
+읽을 수 있는데, 더티 캐시 라인이 CPU 의 캐시에 머무르고 있고, 바뀐 값이 아직
+RAM 에 써지지 않았을 수 있기 때문입니다.  이 문제를 해결하기 위해선, 커널의
+적절한 부분에서 각 CPU 캐시의 문제되는 비트들을 플러시 (flush) 시켜야만 합니다
+(그리고 그것들을 무효화 - invalidation - 시킬 수도 있겠죠).
+
+또한, 디바이스에 의해 RAM 에 DMA 로 쓰여진 값은 디바이스가 쓰기를 완료한 후에
+CPU 의 캐시에서 RAM 으로 쓰여지는 더티 캐시 라인에 의해 덮어써질 수도 있고, CPU
+의 캐시에 존재하는 캐시 라인이 해당 캐시에서 삭제되고 다시 값을 읽어들이기
+전까지는 RAM 이 업데이트 되었다는 사실 자체가 숨겨져 버릴 수도 있습니다.  이
+문제를 해결하기 위해선, 커널의 적절한 부분에서 각 CPU 의 캐시 안의 문제가 되는
+비트들을 무효화 시켜야 합니다.
+
+캐시 관리에 대한 더 많은 정보를 위해선 Documentation/cachetlb.txt 를
+참고하세요.
+
+
+캐시 일관성 VS MMIO
+-------------------
+
+Memory mapped I/O 는 일반적으로 CPU 의 메모리 공간 내의 한 윈도우의 특정 부분
+내의 메모리 지역에 이루어지는데, 이 윈도우는 일반적인, RAM 으로 향하는
+윈도우와는 다른 특성을 갖습니다.
+
+그런 특성 가운데 하나는, 일반적으로 그런 액세스는 캐시를 완전히 우회하고
+디바이스 버스로 곧바로 향한다는 것입니다.  이 말은 MMIO 액세스는 먼저
+시작되어서 캐시에서 완료된 메모리 액세스를 추월할 수 있다는 뜻입니다.  이런
+경우엔 메모리 배리어만으로는 충분치 않고, 만약 캐시된 메모리 쓰기 오퍼레이션과
+MMIO 액세스가 어떤 방식으로든 의존적이라면 해당 캐시는 두 오퍼레이션 사이에
+비워져(flush)야만 합니다.
+
+
+======================
+CPU 들이 저지르는 일들
+======================
+
+프로그래머는 CPU 가 메모리 오퍼레이션들을 정확히 요청한대로 수행해 줄 것이라고
+생각하는데, 예를 들어 다음과 같은 코드를 CPU 에게 넘긴다면:
+
+       a = READ_ONCE(*A);
+       WRITE_ONCE(*B, b);
+       c = READ_ONCE(*C);
+       d = READ_ONCE(*D);
+       WRITE_ONCE(*E, e);
+
+CPU 는 다음 인스트럭션을 처리하기 전에 현재의 인스트럭션을 위한 메모리
+오퍼레이션을 완료할 것이라 생각하고, 따라서 시스템 외부에서 관찰하기에도 정해진
+순서대로 오퍼레이션이 수행될 것으로 예상합니다:
+
+       LOAD *A, STORE *B, LOAD *C, LOAD *D, STORE *E.
+
+
+당연하지만, 실제로는 훨씬 엉망입니다.  많은 CPU 와 컴파일러에서 앞의 가정은
+성립하지 못하는데 그 이유는 다음과 같습니다:
+
+ (*) 로드 오퍼레이션들은 실행을 계속 해나가기 위해 곧바로 완료될 필요가 있는
+     경우가 많은 반면, 스토어 오퍼레이션들은 종종 별다른 문제 없이 유예될 수
+     있습니다;
+
+ (*) 로드 오퍼레이션들은 예측적으로 수행될 수 있으며, 필요없는 로드였다고
+     증명된 예측적 로드의 결과는 버려집니다;
+
+ (*) 로드 오퍼레이션들은 예측적으로 수행될 수 있으므로, 예상된 이벤트의
+     시퀀스와 다른 시간에 로드가 이뤄질 수 있습니다;
+
+ (*) 메모리 액세스 순서는 CPU 버스와 캐시를 좀 더 잘 사용할 수 있도록 재배치
+     될 수 있습니다;
+
+ (*) 로드와 스토어는 인접한 위치에의 액세스들을 일괄적으로 처리할 수 있는
+     메모리나 I/O 하드웨어 (메모리와 PCI 디바이스 둘 다 이게 가능할 수
+     있습니다) 에 대해 요청되는 경우, 개별 오퍼레이션을 위한 트랜잭션 설정
+     비용을 아끼기 위해 조합되어 실행될 수 있습니다; 그리고
+
+ (*) 해당 CPU 의 데이터 캐시가 순서에 영향을 끼칠 수도 있고, 캐시 일관성
+     메커니즘이 - 스토어가 실제로 캐시에 도달한다면 - 이 문제를 완화시킬 수는
+     있지만 이 일관성 관리가 다른 CPU 들에도 같은 순서로 전달된다는 보장은
+     없습니다.
+
+따라서, 앞의 코드에 대해 다른 CPU 가 보는 결과는 다음과 같을 수 있습니다:
+
+       LOAD *A, ..., LOAD {*C,*D}, STORE *E, STORE *B
+
+       ("LOAD {*C,*D}" 는 조합된 로드입니다)
+
+
+하지만, CPU 는 스스로는 일관적일 것을 보장합니다: CPU _자신_ 의 액세스들은
+자신에게는 메모리 배리어가 없음에도 불구하고 정확히 순서 세워진 것으로 보여질
+것입니다.  예를 들어 다음의 코드가 주어졌다면:
+
+       U = READ_ONCE(*A);
+       WRITE_ONCE(*A, V);
+       WRITE_ONCE(*A, W);
+       X = READ_ONCE(*A);
+       WRITE_ONCE(*A, Y);
+       Z = READ_ONCE(*A);
+
+그리고 외부의 영향에 의한 간섭이 없다고 가정하면, 최종 결과는 다음과 같이
+나타날 것이라고 예상될 수 있습니다:
+
+       U == *A 의 최초 값
+       X == W
+       Z == Y
+       *A == Y
+
+앞의 코드는 CPU 가 다음의 메모리 액세스 시퀀스를 만들도록 할겁니다:
+
+       U=LOAD *A, STORE *A=V, STORE *A=W, X=LOAD *A, STORE *A=Y, Z=LOAD *A
+
+하지만, 별다른 개입이 없고 프로그램의 시야에 이 세상이 여전히 일관적이라고
+보인다는 보장만 지켜진다면 이 시퀀스는 어떤 조합으로든 재구성될 수 있으며, 각
+액세스들은 합쳐지거나 버려질 수 있습니다.  일부 아키텍쳐에서 CPU 는 같은 위치에
+대한 연속적인 로드 오퍼레이션들을 재배치 할 수 있기 때문에 앞의 예에서의
+READ_ONCE() 와 WRITE_ONCE() 는 반드시 존재해야 함을 알아두세요.  그런 종류의
+아키텍쳐에서 READ_ONCE() 와 WRITE_ONCE() 는 이 문제를 막기 위해 필요한 일을
+뭐가 됐든지 하게 되는데, 예를 들어 Itanium 에서는 READ_ONCE() 와 WRITE_ONCE()
+가 사용하는 volatile 캐스팅은 GCC 가 그런 재배치를 방지하는 특수 인스트럭션인
+ld.acq 와 stl.rel 인스트럭션을 각각 만들어 내도록 합니다.
+
+컴파일러 역시 이 시퀀스의 액세스들을 CPU 가 보기도 전에 합치거나 버리거나 뒤로
+미뤄버릴 수 있습니다.
+
+예를 들어:
+
+       *A = V;
+       *A = W;
+
+는 다음과 같이 변형될 수 있습니다:
+
+       *A = W;
+
+따라서, 쓰기 배리어나 WRITE_ONCE() 가 없다면 *A 로의 V 값의 저장의 효과는
+사라진다고 가정될 수 있습니다.  비슷하게:
+
+       *A = Y;
+       Z = *A;
+
+는, 메모리 배리어나 READ_ONCE() 와 WRITE_ONCE() 없이는 다음과 같이 변형될 수
+있습니다:
+
+       *A = Y;
+       Z = Y;
+
+그리고 이 LOAD 오퍼레이션은 CPU 바깥에는 아예 보이지 않습니다.
+
+
+그리고, ALPHA 가 있다
+---------------------
+
+DEC Alpha CPU 는 가장 완화된 메모리 순서의 CPU 중 하나입니다.  뿐만 아니라,
+Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서, 의미적으로
+관계되어 있는 두개의 캐시 라인이 서로 다른 시간에 업데이트 되는게 가능합니다.
+이게 데이터 의존성 배리어가 정말 필요해지는 부분인데, 데이터 의존성 배리어는
+메모리 일관성 시스템과 함께 두개의 캐시를 동기화 시켜서, 포인터 변경과 새로운
+데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다.
+
+리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다.
+
+위의 "캐시 일관성" 서브섹션을 참고하세요.
+
+
+가상 머신 게스트
+----------------
+
+가상 머신에서 동작하는 게스트들은 게스트 자체는 SMP 지원 없이 컴파일 되었다
+해도 SMP 영향을 받을 수 있습니다.  이건 UP 커널을 사용하면서 SMP 호스트와
+결부되어 발생하는 부작용입니다.  이 경우에는 mandatory 배리어를 사용해서 문제를
+해결할 수 있겠지만 그런 해결은 대부분의 경우 최적의 해결책이 아닙니다.
+
+이 문제를 완벽하게 해결하기 위해, 로우 레벨의 virt_mb() 등의 매크로를 사용할 수
+있습니다. 이것들은 SMP 가 활성화 되어 있다면 smp_mb() 등과 동일한 효과를
+갖습니다만, SMP 와 SMP 아닌 시스템 모두에 대해 동일한 코드를 만들어냅니다.
+예를 들어, 가상 머신 게스트들은 (SMP 일 수 있는) 호스트와 동기화를 할 때에는
+smp_mb() 가 아니라 virt_mb() 를 사용해야 합니다.
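+
+예를 들어, 게스트가 호스트와 공유하는 링에 버퍼를 공개하는 코드는 대략 다음과
+같은 모습일 수 있습니다 (ring, desc, avail_idx 등은 설명을 위해 가정한
+구조입니다):
+
+       ring->desc[idx].addr = buf_addr;
+       ring->desc[idx].len  = len;
+       virt_wmb();     /* 디스크립터 내용이 인덱스 업데이트보다 먼저 보이게 함 */
+       WRITE_ONCE(ring->avail_idx, idx + 1);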
+
+이것들은 smp_mb() 류의 것들과 모든 부분에서 동일하며, 특히, MMIO 의 영향에
+대해서는 간여하지 않습니다: MMIO 의 영향을 제어하려면, mandatory 배리어를
+사용하시기 바랍니다.
+
+
+=======
+사용 예
+=======
+
+순환식 버퍼
+-----------
+
+메모리 배리어는 순환식 버퍼를 생성자(producer)와 소비자(consumer) 사이의
+동기화에 락을 사용하지 않고 구현하는데에 사용될 수 있습니다.  더 자세한 내용을
+위해선 다음을 참고하세요:
+
+       Documentation/circular-buffers.txt
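+
+아주 간략히 요약하면, 생산자는 데이터를 쓴 뒤 head 인덱스를
+smp_store_release() 로 공개하고, 소비자는 head 를 smp_load_acquire() 로 읽은
+뒤 데이터를 읽는 구조입니다.  다음은 가정된 구조체 필드들을 사용한 최소한의
+스케치입니다 (자세한 내용과 정확한 형태는 위 문서를 참고하세요):
+
+       /* 생산자 쪽 */
+       unsigned long head = buf->head;
+       unsigned long tail = READ_ONCE(buf->tail);
+
+       if (CIRC_SPACE(head, tail, buf->size) >= 1) {
+               buf->data[head] = item;         /* 데이터를 먼저 쓰고 */
+               smp_store_release(&buf->head,   /* 그 다음 head 를 공개 */
+                                 (head + 1) & (buf->size - 1));
+       }
+
+       /* 소비자 쪽 */
+       unsigned long head = smp_load_acquire(&buf->head);
+       unsigned long tail = buf->tail;
+
+       if (CIRC_CNT(head, tail, buf->size) >= 1) {
+               item = buf->data[tail];         /* 데이터를 읽은 뒤 */
+               smp_store_release(&buf->tail,   /* tail 을 공개해 공간을 반환 */
+                                 (tail + 1) & (buf->size - 1));
+       }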
+
+
+=========
+참고 문헌
+=========
+
+Alpha AXP Architecture Reference Manual, Second Edition (Sites & Witek,
+Digital Press)
+       Chapter 5.2: Physical Address Space Characteristics
+       Chapter 5.4: Caches and Write Buffers
+       Chapter 5.5: Data Sharing
+       Chapter 5.6: Read/Write Ordering
+
+AMD64 Architecture Programmer's Manual Volume 2: System Programming
+       Chapter 7.1: Memory-Access Ordering
+       Chapter 7.4: Buffering and Combining Memory Writes
+
+IA-32 Intel Architecture Software Developer's Manual, Volume 3:
+System Programming Guide
+       Chapter 7.1: Locked Atomic Operations
+       Chapter 7.2: Memory Ordering
+       Chapter 7.4: Serializing Instructions
+
+The SPARC Architecture Manual, Version 9
+       Chapter 8: Memory Models
+       Appendix D: Formal Specification of the Memory Models
+       Appendix J: Programming with the Memory Models
+
+UltraSPARC Programmer Reference Manual
+       Chapter 5: Memory Accesses and Cacheability
+       Chapter 15: Sparc-V9 Memory Models
+
+UltraSPARC III Cu User's Manual
+       Chapter 9: Memory Models
+
+UltraSPARC IIIi Processor User's Manual
+       Chapter 8: Memory Models
+
+UltraSPARC Architecture 2005
+       Chapter 9: Memory
+       Appendix D: Formal Specifications of the Memory Models
+
+UltraSPARC T1 Supplement to the UltraSPARC Architecture 2005
+       Chapter 8: Memory Models
+       Appendix F: Caches and Cache Coherency
+
+Solaris Internals, Core Kernel Architecture, p63-68:
+       Chapter 3.3: Hardware Considerations for Locks and
+                       Synchronization
+
+Unix Systems for Modern Architectures, Symmetric Multiprocessing and Caching
+for Kernel Programmers:
+       Chapter 13: Other Memory Models
+
+Intel Itanium Architecture Software Developer's Manual: Volume 1:
+       Section 2.6: Speculation
+       Section 4.4: Memory Access
index a4d0a99de04da76d62bc0ceebae88fd5283473e4..ba818ecce6f99508f1136a0eb0d3ee07eee74715 100644 (file)
@@ -609,7 +609,7 @@ A data-dependency barrier must also order against dependent writes:
 The data-dependency barrier must order the read into Q with the store
 into *Q.  This prohibits this outcome:
 
-       (Q == B) && (B == 4)
+       (Q == &B) && (B == 4)
 
 Please note that this pattern should be rare.  After all, the whole point
 of dependency ordering is to -prevent- writes to the data structure, along
@@ -1928,6 +1928,7 @@ There are some more advanced barrier functions:
 
      See Documentation/DMA-API.txt for more information on consistent memory.
 
+
 MMIO WRITE BARRIER
 ------------------
 
@@ -2075,7 +2076,7 @@ systems, and so cannot be counted on in such a situation to actually achieve
 anything at all - especially with respect to I/O accesses - unless combined
 with interrupt disabling operations.
 
-See also the section on "Inter-CPU locking barrier effects".
+See also the section on "Inter-CPU acquiring barrier effects".
 
 
 As an example, consider the following:
index 53a2fe1ae8b8343478a1c3d7c11cba7f3572b15d..8e37b0ba2c9dd2f6de7e77e31fde58750693f748 100644 (file)
@@ -16,6 +16,7 @@ CONTENTS
    4.1 System-wide settings
    4.2 Task interface
    4.3 Default behavior
+   4.4 Behavior of sched_yield()
  5. Tasks CPU affinity
    5.1 SCHED_DEADLINE and cpusets HOWTO
  6. Future plans
@@ -426,6 +427,23 @@ CONTENTS
  Finally, notice that in order not to jeopardize the admission control a
  -deadline task cannot fork.
 
+
+4.4 Behavior of sched_yield()
+-----------------------------
+
+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
+ remaining runtime and is immediately throttled, until the next
+ period, when its runtime will be replenished (a special flag
+ dl_yielded is set and used to handle correctly throttling and runtime
+ replenishment after a call to sched_yield()).
+
+ This behavior of sched_yield() allows the task to wake up exactly at
+ the beginning of the next period. Also, this may be useful in the
+ future with bandwidth reclaiming mechanisms, where sched_yield() will
+ make the leftover runtime available for reclamation by other
+ SCHED_DEADLINE tasks.
+
+
 5. Tasks CPU affinity
 =====================
 
index dd5f916b351d1cdd7f11162cb497530973c53940..a273dd0bbaaa52a86691c84ae64e787a30d0c9b0 100644 (file)
@@ -203,6 +203,17 @@ along to ftrace_push_return_trace() instead of a stub value of 0.
 
 Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer.
 
+HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+--------------------------------
+
+An arch may pass in a pointer to the return address on the stack.  This
+prevents potential stack unwinding issues where the unwinder gets out of
+sync with ret_stack and the wrong addresses are reported by
+ftrace_graph_ret_addr().
+
+Adding support for it is easy: just define the macro in asm/ftrace.h and
+pass the return address pointer as the 'retp' argument to
+ftrace_push_return_trace().
 
 HAVE_FTRACE_NMI_ENTER
 ---------------------
index ea52ec1f8484353b904431a59f4f2e6cedce5830..e4991fb1eedcd4efcd258a7b1290fd087e5f9923 100644 (file)
@@ -44,8 +44,8 @@ Synopsis of kprobe_events
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
-                 (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
-                 are supported.
+                 (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
+                 (x8/x16/x32/x64), "string" and bitfield are supported.
 
   (*) only for return probe.
   (**) this is useful for fetching a field of data structures.
@@ -54,7 +54,10 @@ Types
 -----
 Several types are supported for fetch-args. Kprobe tracer will access memory
 by given type. Prefix 's' and 'u' means those types are signed and unsigned
-respectively. Traced arguments are shown in decimal (signed) or hex (unsigned).
+respectively. The 'x' prefix implies an unsigned type. Traced arguments are
+shown in decimal ('s' and 'u') or hexadecimal ('x'). Without an explicit type
+cast, 'x32' or 'x64' is used depending on the architecture (e.g. x86-32 uses
+x32, and x86-64 uses x64).
 String type is a special type, which fetches a "null-terminated" string from
 kernel space. This means it will fail and store NULL if the string container
 has been paged out.
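
For illustration only (not part of the patch): registering a probe whose fetch-args use the new hexadecimal types, by appending a definition to kprobe_events. The sketch assumes debugfs is mounted at /sys/kernel/debug; the symbol and register choices (do_sys_open, %di, %dx) are examples, not a recommendation.

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                /* dfd and flags are printed in hex thanks to the x32 type */
                const char def[] =
                        "p:myprobe do_sys_open dfd=%di:x32 flags=%dx:x32\n";
                int fd = open("/sys/kernel/debug/tracing/kprobe_events",
                              O_WRONLY | O_APPEND);

                if (fd < 0) {
                        perror("kprobe_events");
                        return 1;
                }
                if (write(fd, def, strlen(def)) < 0) {
                        perror("write");
                        close(fd);
                        return 1;
                }
                return close(fd);
        }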
index 72d1cd4f7bf3b2a3098f6b4f87fac3a91c859716..94b6b45817635b25b4c68a75cf393f78f2388b21 100644 (file)
@@ -40,8 +40,8 @@ Synopsis of uprobe_tracer
    +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
    NAME=FETCHARG     : Set NAME as the argument name of FETCHARG.
    FETCHARG:TYPE     : Set TYPE as the type of FETCHARG. Currently, basic types
-                      (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
-                      are supported.
+                      (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
+                      (x8/x16/x32/x64), "string" and bitfield are supported.
 
   (*) only for return probe.
   (**) this is useful for fetching a field of data structures.
@@ -50,7 +50,10 @@ Types
 -----
 Several types are supported for fetch-args. Uprobe tracer will access memory
 by given type. Prefix 's' and 'u' means those types are signed and unsigned
-respectively. Traced arguments are shown in decimal (signed) or hex (unsigned).
+respectively. The 'x' prefix implies an unsigned type. Traced arguments are
+shown in decimal ('s' and 'u') or hexadecimal ('x'). Without an explicit type
+cast, 'x32' or 'x64' is used depending on the architecture (e.g. x86-32 uses
+x32, and x86-64 uses x64).
 String type is a special type, which fetches a "null-terminated" string from
 user space.
 Bitfield is another special type, which takes 3 parameters, bit-width, bit-
index c281ded1ba16e0ec436988d8d37d9bfd202ac871..6da7689601d1997b76557f2112a3055f6eb581e3 100644 (file)
@@ -18,6 +18,68 @@ even though there is theoretically space in the PAE PTEs.  These
 permissions are enforced on data access only and have no effect on
 instruction fetches.
 
+=========================== Syscalls ===========================
+
+There are 3 system calls which directly interact with pkeys:
+
+       int pkey_alloc(unsigned long flags, unsigned long init_access_rights);
+       int pkey_free(int pkey);
+       int pkey_mprotect(unsigned long start, size_t len,
+                         unsigned long prot, int pkey);
+
+Before a pkey can be used, it must first be allocated with
+pkey_alloc().  An application executes the WRPKRU instruction
+directly in order to change access permissions for memory covered
+by a key.  In this example WRPKRU is wrapped by a C function
+called pkey_set().
+
+       int real_prot = PROT_READ|PROT_WRITE;
+       pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+       ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
+       ... application runs here
+
+Now, if the application needs to update the data at 'ptr', it can
+gain access, do the update, then remove its write access:
+
+       pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+       *ptr = foo; // assign something
+       pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+
+Now when it frees the memory, it will also free the pkey since it
+is no longer in use:
+
+       munmap(ptr, PAGE_SIZE);
+       pkey_free(pkey);
+
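The example above leaves the pkey_set() wrapper to the reader. A minimal user-space sketch of such a wrapper follows (not part of the patch); it assumes the rights value is built from the two UAPI bits added elsewhere in this series (PKEY_DISABLE_ACCESS = 0x1, PKEY_DISABLE_WRITE = 0x2, so the PKEY_DENY_WRITE above stands in for PKEY_DISABLE_WRITE) and uses the documented RDPKRU/WRPKRU encodings:

        static inline unsigned int rdpkru(void)
        {
                unsigned int eax, edx;
                unsigned int ecx = 0;

                /* RDPKRU: requires ECX = 0, returns PKRU in EAX */
                asm volatile(".byte 0x0f,0x01,0xee"
                             : "=a" (eax), "=d" (edx)
                             : "c" (ecx));
                return eax;
        }

        static inline void wrpkru(unsigned int pkru)
        {
                unsigned int ecx = 0, edx = 0;

                /* WRPKRU: loads EAX into PKRU, requires ECX = EDX = 0 */
                asm volatile(".byte 0x0f,0x01,0xef"
                             : : "a" (pkru), "c" (ecx), "d" (edx)
                             : "memory");
        }

        void pkey_set(int pkey, unsigned long rights)
        {
                unsigned int pkru = rdpkru();

                /* each key owns two PKRU bits: access-disable, write-disable */
                pkru &= ~(3U << (2 * pkey));
                pkru |= (unsigned int)rights << (2 * pkey);
                wrpkru(pkru);
        }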
+=========================== Behavior ===========================
+
+The kernel attempts to make protection keys consistent with the
+behavior of a plain mprotect().  For instance if you do this:
+
+       mprotect(ptr, size, PROT_NONE);
+       something(ptr);
+
+you can expect the same effects with protection keys when doing this:
+
+       pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE);
+       pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
+       something(ptr);
+
+That should be true whether something() is a direct access to 'ptr'
+like:
+
+       *ptr = foo;
+
+or when the kernel does the access on the application's behalf like
+with a read():
+
+       read(fd, ptr, 1);
+
+The kernel will send a SIGSEGV in both cases, but si_code will be set
+to SEGV_PKUERR when a protection key is violated versus SEGV_ACCERR
+when the plain mprotect() permissions are violated.
+
 =========================== Config Option ===========================
 
 This config option adds approximately 1.5kb of text and 50 bytes of
index 11535f6a48fb1df0bc643ed4bd25f8e894b1664d..c64e59f7ed328ebb008787a0e9ee8260a1dc3df6 100644 (file)
@@ -8908,6 +8908,7 @@ S:        Supported
 F:     Documentation/virtual/paravirt_ops.txt
 F:     arch/*/kernel/paravirt*
 F:     arch/*/include/asm/paravirt.h
+F:     include/linux/hypervisor.h
 
 PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES
 M:     Tim Waugh <tim@cyberelk.net>
index 47a3ab795955052c3cc553b2c25e373be58e2b72..d0b205d34946c8ec046c551150b78ab1d8aa7eef 100644 (file)
@@ -733,4 +733,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
 config CPU_NO_EFFICIENT_FFS
        def_bool n
 
+config HAVE_ARCH_VMAP_STACK
+       def_bool n
+       help
+         An arch should select this symbol if it can support kernel stacks
+         in vmalloc space.  This means:
+
+         - vmalloc space must be large enough to hold many kernel stacks.
+           This may rule out many 32-bit architectures.
+
+         - Stacks in vmalloc space need to work reliably.  For example, if
+           vmap page tables are created on demand, either this mechanism
+           needs to work while the stack points to a virtual address with
+           unpopulated page tables or arch code (switch_to() and switch_mm(),
+           most likely) needs to ensure that the stack's page table entries
+           are populated before running on a possibly unpopulated stack.
+
+         - If the stack overflows into a guard page, something reasonable
+           should happen.  The definition of "reasonable" is flexible, but
+           instantly rebooting without logging anything would be unfriendly.
+
+config VMAP_STACK
+       default y
+       bool "Use a virtually-mapped stack"
+       depends on HAVE_ARCH_VMAP_STACK && !KASAN
+       ---help---
+         Enable this if you want to use virtually-mapped kernel stacks
+         with guard pages.  This causes kernel stack overflows to be
+         caught immediately rather than causing difficult-to-diagnose
+         corruption.
+
+         This is presently incompatible with KASAN because KASAN expects
+         the stack to map directly to the KASAN shadow map using a formula
+         that is incorrect if the stack is in vmalloc space.
+
 source "kernel/gcov/Kconfig"
index fec1947b8dbcdbc444f65254df5f3d4eeec717be..02760f6e6ca443547eb2bfbf248fe41fefc49be7 100644 (file)
@@ -78,4 +78,9 @@
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
 
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                                PKEY_DISABLE_WRITE)
+
 #endif /* __ALPHA_MMAN_H__ */
index 709ee1d6d4df4813c82ad1bb23dac7868e00f999..3f1759411d51bec32175992a2332ab07ca598a8a 100644 (file)
@@ -218,7 +218,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
        }
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index 0f03a8fe23144e777b3ead0a6ea18e038b5d1066..aef02d2af3b50db44476e887291b09c4056bf1c2 100644 (file)
@@ -219,7 +219,7 @@ ENDPROC(ftrace_graph_caller)
  *
  * Run ftrace_return_to_handler() before going back to parent.
  * @fp is checked against the value passed by ftrace_graph_caller()
- * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled.
+ * only when HAVE_FUNCTION_GRAPH_FP_TEST is enabled.
  */
 ENTRY(return_to_handler)
        save_return_regs
index ebecf9aa33d12da8a564ea0314f59a71b89e64a0..40ad08ac569af76b1f58fa775101ef264c480039 100644 (file)
@@ -138,7 +138,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                return;
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY)
                return;
        else
index 28d059540424f5ac07ffae53e556d5d2c6f1393f..3b8bdcbb7da3e3235590ee86d681bd106a1ea8e9 100644 (file)
@@ -169,7 +169,7 @@ ENTRY(_ftrace_graph_caller)
        r0 = sp;        /* unsigned long *parent */
        r1 = [sp];      /* unsigned long self_addr */
 # endif
-# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+# ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        r2 = fp;        /* unsigned long frame_pointer */
 # endif
        r0 += 16;       /* skip the 4 local regs on stack */
@@ -190,7 +190,7 @@ ENTRY(_return_to_handler)
        [--sp] = r1;
 
        /* get original return address */
-# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+# ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        r0 = fp;        /* Blackfin is sane, so omit this */
 # endif
        call _ftrace_return_to_handler;
index 095de0fa044d0eae0a698e8f02fae14329b4cfb0..8dad7589b8436a7a63acacd36bcc51f9b94d91d4 100644 (file)
@@ -107,7 +107,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                return;
 
        if (ftrace_push_return_trace(*parent, self_addr, &trace.depth,
-                                    frame_pointer) == -EBUSY)
+                                    frame_pointer, NULL) == -EBUSY)
                return;
 
        trace.func = self_addr;
index 29bd59790d6c087939e28f484d303ae30378d21c..c7026429816be5d319462f2e9bdd1e373925f828 100644 (file)
@@ -56,7 +56,7 @@ struct thread_info {
 #define alloc_thread_stack_node(tsk, node)     ((unsigned long *) 0)
 #define task_thread_info(tsk)  ((struct thread_info *) 0)
 #endif
-#define free_thread_stack(ti)  /* nothing */
+#define free_thread_stack(tsk) /* nothing */
 #define task_stack_page(tsk)   ((void *)(tsk))
 
 #define __HAVE_THREAD_FUNCTIONS
index fc7b48a52cd554c74be87e2185f7594c68c20beb..d57563c58a26be43672098d9895d9107c945e8d8 100644 (file)
@@ -63,7 +63,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
                return;
        }
 
-       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0);
+       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index ccdcfcbb24aa60e2d2f161b61067ad4061caca84..655e2fb5395b5ba4dbfba2df396c9c0c8752eb2e 100644 (file)
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
 
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                                PKEY_DISABLE_WRITE)
+
 #endif /* _ASM_MMAN_H */
index 937c54bc8ccc499504c561f39c026b2c58417b50..30a3b75e88eb6a3310808d8c743739efa3f98fb7 100644 (file)
@@ -382,8 +382,8 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra,
        if (unlikely(faulted))
                goto out;
 
-       if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp)
-           == -EBUSY) {
+       if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp,
+                                    NULL) == -EBUSY) {
                *parent_ra_addr = old_parent_ra;
                return;
        }
index f3db7d8eb0c265a875fbcbb1d72f1cfaa058efbd..5979745815a5ebcc2825f21a8b98125a0916d5fa 100644 (file)
@@ -75,4 +75,9 @@
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
 
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                                PKEY_DISABLE_WRITE)
+
 #endif /* __PARISC_MMAN_H__ */
index a828a0adf52c0b19d14eb12219bf718c3cf37704..5a5506a35395c411f91a61a59ebd50ed3ced38b6 100644 (file)
@@ -48,7 +48,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent,
                return;
 
         if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-                       0) == -EBUSY)
+                                    0, NULL) == -EBUSY)
                 return;
 
        /* activate parisc_return_to_handler() as return point */
index cc52d9795f88d5d6989f89adbf876030429402f1..a95639b8d4ac5d72a7be17d2b1e20e3f5fa874a1 100644 (file)
@@ -593,7 +593,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
        if (!ftrace_graph_entry(&trace))
                goto out;
 
-       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY)
+       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
+                                    NULL) == -EBUSY)
                goto out;
 
        parent = return_hooker;
index 7d95bc402dba4b9ba411c5018091b262d4cd4cb6..c491f2c8f2b933a7c3b9dec88ddaa32fcd91027e 100644 (file)
@@ -369,44 +369,34 @@ void destroy_context(struct mm_struct *mm)
 }
 
 #ifdef CONFIG_SMP
-
-static int mmu_context_cpu_notify(struct notifier_block *self,
-                                 unsigned long action, void *hcpu)
+static int mmu_ctx_cpu_prepare(unsigned int cpu)
 {
-       unsigned int cpu = (unsigned int)(long)hcpu;
-
        /* We don't touch CPU 0 map, it's allocated at boot and kept
         * around forever
         */
        if (cpu == boot_cpuid)
-               return NOTIFY_OK;
-
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
-               stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
-               break;
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
-               kfree(stale_map[cpu]);
-               stale_map[cpu] = NULL;
-
-               /* We also clear the cpu_vm_mask bits of CPUs going away */
-               clear_tasks_mm_cpumask(cpu);
-       break;
-#endif /* CONFIG_HOTPLUG_CPU */
-       }
-       return NOTIFY_OK;
+               return 0;
+
+       pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+       stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+       return 0;
 }
 
-static struct notifier_block mmu_context_cpu_nb = {
-       .notifier_call  = mmu_context_cpu_notify,
-};
+static int mmu_ctx_cpu_dead(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+       if (cpu == boot_cpuid)
+               return 0;
+
+       pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+       kfree(stale_map[cpu]);
+       stale_map[cpu] = NULL;
+
+       /* We also clear the cpu_vm_mask bits of CPUs going away */
+       clear_tasks_mm_cpumask(cpu);
+#endif
+       return 0;
+}
 
 #endif /* CONFIG_SMP */
 
@@ -469,7 +459,9 @@ void __init mmu_context_init(void)
 #else
        stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
 
-       register_cpu_notifier(&mmu_context_cpu_nb);
+       cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
+                                 "powerpc/mmu/ctx:prepare",
+                                 mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
 #endif
 
        printk(KERN_INFO
index 834868b9fdc99af91b802fa0d51580ca6375e5ae..366e4f510fcf5dc26341493667c19f1229bccaa1 100644 (file)
@@ -852,37 +852,33 @@ static void smp_core99_setup_cpu(int cpu_nr)
 
 #ifdef CONFIG_PPC64
 #ifdef CONFIG_HOTPLUG_CPU
-static int smp_core99_cpu_notify(struct notifier_block *self,
-                                unsigned long action, void *hcpu)
+static unsigned int smp_core99_host_open;
+
+static int smp_core99_cpu_prepare(unsigned int cpu)
 {
        int rc;
 
-       switch(action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               /* Open i2c bus if it was used for tb sync */
-               if (pmac_tb_clock_chip_host) {
-                       rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1);
-                       if (rc) {
-                               pr_err("Failed to open i2c bus for time sync\n");
-                               return notifier_from_errno(rc);
-                       }
+       /* Open i2c bus if it was used for tb sync */
+       if (pmac_tb_clock_chip_host && !smp_core99_host_open) {
+               rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1);
+               if (rc) {
+                       pr_err("Failed to open i2c bus for time sync\n");
+                       return notifier_from_errno(rc);
                }
-               break;
-       case CPU_ONLINE:
-       case CPU_UP_CANCELED:
-               /* Close i2c bus if it was used for tb sync */
-               if (pmac_tb_clock_chip_host)
-                       pmac_i2c_close(pmac_tb_clock_chip_host);
-               break;
-       default:
-               break;
+               smp_core99_host_open = 1;
        }
-       return NOTIFY_OK;
+       return 0;
 }
 
-static struct notifier_block smp_core99_cpu_nb = {
-       .notifier_call  = smp_core99_cpu_notify,
-};
+static int smp_core99_cpu_online(unsigned int cpu)
+{
+       /* Close i2c bus if it was used for tb sync */
+       if (pmac_tb_clock_chip_host && smp_core99_host_open) {
+               pmac_i2c_close(pmac_tb_clock_chip_host);
+               smp_core99_host_open = 0;
+       }
+       return 0;
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void __init smp_core99_bringup_done(void)
@@ -902,7 +898,11 @@ static void __init smp_core99_bringup_done(void)
                g5_phy_disable_cpu1();
        }
 #ifdef CONFIG_HOTPLUG_CPU
-       register_cpu_notifier(&smp_core99_cpu_nb);
+       cpuhp_setup_state_nocalls(CPUHP_POWERPC_PMAC_PREPARE,
+                                 "powerpc/pmac:prepare", smp_core99_cpu_prepare,
+                                 NULL);
+       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "powerpc/pmac:online",
+                                 smp_core99_cpu_online, NULL);
 #endif
 
        if (ppc_md.progress)
index 0f7bfeba6da6a3632f2705ebbaccc065ffd356a2..60a8a4e207edbc080a15b809b3efd97ea245942a 100644 (file)
@@ -209,7 +209,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
        /* Only trace if the calling function expects to. */
        if (!ftrace_graph_entry(&trace))
                goto out;
-       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY)
+       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
+                                    NULL) == -EBUSY)
                goto out;
        parent = (unsigned long) return_to_handler;
 out:
index 38993e09ef03ef371932707058c57bd15dd2d875..95eccd49672f40ca7f5e10f998c3226581221c29 100644 (file)
@@ -382,7 +382,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
                return;
        }
 
-       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0);
+       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL);
        if (err == -EBUSY) {
                __raw_writel(old, parent);
                return;
index 59b09600dd326b8d0e24853d720d18d73837cd69..f5d60f14a0bcf50ed87ba2c5a104c6e15f9f90be 100644 (file)
@@ -56,7 +56,6 @@ config SPARC64
        def_bool 64BIT
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
-       select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_KRETPROBES
        select HAVE_KPROBES
        select HAVE_RCU_TABLE_FREE if SMP
index 3192a8e42fd62c6f244a2b5ba5887da395847abd..62755a339a5932b96ddc42fd2a0eb8c3affb149b 100644 (file)
@@ -9,6 +9,10 @@
 void _mcount(void);
 #endif
 
+#endif /* CONFIG_MCOUNT */
+
+#if defined(CONFIG_SPARC64) && !defined(CC_USE_FENTRY)
+#define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
index 0a2d2ddff543fd325709fc46eaf48d438d06b88b..6bcff698069bf1fa57c1d84df180c1cb67f18a65 100644 (file)
@@ -131,7 +131,7 @@ unsigned long prepare_ftrace_return(unsigned long parent,
                return parent + 8UL;
 
        if (ftrace_push_return_trace(parent, self_addr, &trace.depth,
-                                    frame_pointer) == -EBUSY)
+                                    frame_pointer, NULL) == -EBUSY)
                return parent + 8UL;
 
        trace.func = self_addr;
index fb30e7c6a5b15bac54e052010784327f62290589..e80e6ba3d5002159bd5836554de791da494ade1d 100644 (file)
@@ -352,9 +352,7 @@ static void sparc_start_secondary(void *arg)
        preempt_disable();
        cpu = smp_processor_id();
 
-       /* Invoke the CPU_STARTING notifier callbacks */
        notify_cpu_starting(cpu);
-
        arch_cpu_pre_online(arg);
 
        /* Set the CPU in the cpu_online_mask */
index 4a572088b270bec26a0b10cce0453ab2d15e6c20..b827a418b155021104d27de2d821cf81e59babce 100644 (file)
@@ -184,7 +184,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
        *parent = return_hooker;
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index 2a1f0ce7c59acac6e1a6543eeda61ccb3792f277..4c3972847c2a8e8052e29da8224da4fc1894441c 100644 (file)
@@ -93,6 +93,7 @@ config X86
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_WITHIN_STACK_FRAMES
        select HAVE_EBPF_JIT                    if X86_64
+       select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_CC_STACKPROTECTOR
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
@@ -109,7 +110,6 @@ config X86
        select HAVE_EXIT_THREAD
        select HAVE_FENTRY                      if X86_64
        select HAVE_FTRACE_MCOUNT_RECORD
-       select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS
index ff574dad95ccaf2739d0edfe85c94acea1e15134..94dd4a31f5b39a88f205fbda2a452f1228cd9516 100644 (file)
@@ -1004,79 +1004,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
        return status;
 }
 
-static efi_status_t exit_boot(struct boot_params *boot_params,
-                             void *handle, bool is64)
-{
-       struct efi_info *efi = &boot_params->efi_info;
-       unsigned long map_sz, key, desc_size;
-       efi_memory_desc_t *mem_map;
+struct exit_boot_struct {
+       struct boot_params *boot_params;
+       struct efi_info *efi;
        struct setup_data *e820ext;
-       const char *signature;
        __u32 e820ext_size;
-       __u32 nr_desc, prev_nr_desc;
-       efi_status_t status;
-       __u32 desc_version;
-       bool called_exit = false;
-       u8 nr_entries;
-       int i;
-
-       nr_desc = 0;
-       e820ext = NULL;
-       e820ext_size = 0;
-
-get_map:
-       status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size,
-                                   &desc_version, &key);
-
-       if (status != EFI_SUCCESS)
-               return status;
-
-       prev_nr_desc = nr_desc;
-       nr_desc = map_sz / desc_size;
-       if (nr_desc > prev_nr_desc &&
-           nr_desc > ARRAY_SIZE(boot_params->e820_map)) {
-               u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map);
-
-               status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size);
-               if (status != EFI_SUCCESS)
-                       goto free_mem_map;
+       bool is64;
+};
 
-               efi_call_early(free_pool, mem_map);
-               goto get_map; /* Allocated memory, get map again */
+static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
+                                  struct efi_boot_memmap *map,
+                                  void *priv)
+{
+       static bool first = true;
+       const char *signature;
+       __u32 nr_desc;
+       efi_status_t status;
+       struct exit_boot_struct *p = priv;
+
+       if (first) {
+               nr_desc = *map->buff_size / *map->desc_size;
+               if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) {
+                       u32 nr_e820ext = nr_desc -
+                                       ARRAY_SIZE(p->boot_params->e820_map);
+
+                       status = alloc_e820ext(nr_e820ext, &p->e820ext,
+                                              &p->e820ext_size);
+                       if (status != EFI_SUCCESS)
+                               return status;
+               }
+               first = false;
        }
 
-       signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
-       memcpy(&efi->efi_loader_signature, signature, sizeof(__u32));
+       signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
+       memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32));
 
-       efi->efi_systab = (unsigned long)sys_table;
-       efi->efi_memdesc_size = desc_size;
-       efi->efi_memdesc_version = desc_version;
-       efi->efi_memmap = (unsigned long)mem_map;
-       efi->efi_memmap_size = map_sz;
+       p->efi->efi_systab = (unsigned long)sys_table_arg;
+       p->efi->efi_memdesc_size = *map->desc_size;
+       p->efi->efi_memdesc_version = *map->desc_ver;
+       p->efi->efi_memmap = (unsigned long)*map->map;
+       p->efi->efi_memmap_size = *map->map_size;
 
 #ifdef CONFIG_X86_64
-       efi->efi_systab_hi = (unsigned long)sys_table >> 32;
-       efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+       p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
+       p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
 #endif
 
+       return EFI_SUCCESS;
+}
+
+static efi_status_t exit_boot(struct boot_params *boot_params,
+                             void *handle, bool is64)
+{
+       unsigned long map_sz, key, desc_size, buff_size;
+       efi_memory_desc_t *mem_map;
+       struct setup_data *e820ext;
+       __u32 e820ext_size;
+       efi_status_t status;
+       __u32 desc_version;
+       struct efi_boot_memmap map;
+       struct exit_boot_struct priv;
+
+       map.map =               &mem_map;
+       map.map_size =          &map_sz;
+       map.desc_size =         &desc_size;
+       map.desc_ver =          &desc_version;
+       map.key_ptr =           &key;
+       map.buff_size =         &buff_size;
+       priv.boot_params =      boot_params;
+       priv.efi =              &boot_params->efi_info;
+       priv.e820ext =          NULL;
+       priv.e820ext_size =     0;
+       priv.is64 =             is64;
+
        /* Might as well exit boot services now */
-       status = efi_call_early(exit_boot_services, handle, key);
-       if (status != EFI_SUCCESS) {
-               /*
-                * ExitBootServices() will fail if any of the event
-                * handlers change the memory map. In which case, we
-                * must be prepared to retry, but only once so that
-                * we're guaranteed to exit on repeated failures instead
-                * of spinning forever.
-                */
-               if (called_exit)
-                       goto free_mem_map;
-
-               called_exit = true;
-               efi_call_early(free_pool, mem_map);
-               goto get_map;
-       }
+       status = efi_exit_boot_services(sys_table, handle, &map, &priv,
+                                       exit_boot_func);
+       if (status != EFI_SUCCESS)
+               return status;
 
+       e820ext = priv.e820ext;
+       e820ext_size = priv.e820ext_size;
        /* Historic? */
        boot_params->alt_mem_k = 32 * 1024;
 
@@ -1085,10 +1093,6 @@ get_map:
                return status;
 
        return EFI_SUCCESS;
-
-free_mem_map:
-       efi_call_early(free_pool, mem_map);
-       return status;
 }
 
 /*
index 64d29a3ad37676b079c6488a350b4de5debdae61..21b352a11b493f4868b1a091dc416c7e2279ad34 100644 (file)
        POP_GS_EX
 .endm
 
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+       /*
+        * Save callee-saved registers
+        * This must match the order in struct inactive_task_frame
+        */
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %edi
+       pushl   %esi
+
+       /* switch stack */
+       movl    %esp, TASK_threadsp(%eax)
+       movl    TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+       movl    TASK_stack_canary(%edx), %ebx
+       movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+       /* restore callee-saved registers */
+       popl    %esi
+       popl    %edi
+       popl    %ebx
+       popl    %ebp
+
+       jmp     __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
 ENTRY(ret_from_fork)
        pushl   %eax
        call    schedule_tail
        popl    %eax
 
+       testl   %ebx, %ebx
+       jnz     1f              /* kernel threads are uncommon */
+
+2:
        /* When we fork, we trace the syscall return in the child, too. */
        movl    %esp, %eax
        call    syscall_return_slowpath
        jmp     restore_all
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-       pushl   %eax
-       call    schedule_tail
-       popl    %eax
-       movl    PT_EBP(%esp), %eax
-       call    *PT_EBX(%esp)
-       movl    $0, PT_EAX(%esp)
 
+       /* kernel thread */
+1:     movl    %edi, %eax
+       call    *%ebx
        /*
-        * Kernel threads return to userspace as if returning from a syscall.
-        * We should check whether anything actually uses this path and, if so,
-        * consider switching it over to ret_from_fork.
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
         */
-       movl    %esp, %eax
-       call    syscall_return_slowpath
-       jmp     restore_all
-ENDPROC(ret_from_kernel_thread)
+       movl    $0, PT_EAX(%esp)
+       jmp     2b
+END(ret_from_fork)
 
 /*
  * Return to user mode is not as complex as all this looks,
index aa605263c45b6344e9e7f135d015697dae2b0e3e..b9ca6b34b6c539d338445bd5ce5034b98fa41125 100644 (file)
@@ -352,8 +352,7 @@ ENTRY(stub_ptregs_64)
        jmp     entry_SYSCALL64_slow_path
 
 1:
-       /* Called from C */
-       jmp     *%rax                           /* called from C */
+       jmp     *%rax                           /* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -369,42 +368,74 @@ END(ptregs_\func)
 #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
 #include <asm/syscalls_64.h>
 
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+       /*
+        * Save callee-saved registers
+        * This must match the order in inactive_task_frame
+        */
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+       /* switch stack */
+       movq    %rsp, TASK_threadsp(%rdi)
+       movq    TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+       movq    TASK_stack_canary(%rsi), %rbx
+       movq    %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+       /* restore callee-saved registers */
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbx
+       popq    %rbp
+
+       jmp     __switch_to
+END(__switch_to_asm)
+
 /*
  * A newly forked process directly context switches into this address.
  *
- * rdi: prev task we switched from
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
-       LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+       movq    %rax, %rdi
        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
-       testb   $3, CS(%rsp)                    /* from kernel_thread? */
-       jnz     1f
-
-       /*
-        * We came from kernel_thread.  This code path is quite twisted, and
-        * someone should clean it up.
-        *
-        * copy_thread_tls stashes the function pointer in RBX and the
-        * parameter to be passed in RBP.  The called function is permitted
-        * to call do_execve and thereby jump to user mode.
-        */
-       movq    RBP(%rsp), %rdi
-       call    *RBX(%rsp)
-       movl    $0, RAX(%rsp)
-
-       /*
-        * Fall through as though we're exiting a syscall.  This makes a
-        * twisted sort of sense if we just called do_execve.
-        */
+       testq   %rbx, %rbx                      /* from kernel_thread? */
+       jnz     1f                              /* kernel threads are uncommon */
 
-1:
+2:
        movq    %rsp, %rdi
        call    syscall_return_slowpath /* returns with IRQs disabled */
        TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
        SWAPGS
        jmp     restore_regs_and_iret
+
+1:
+       /* kernel thread */
+       movq    %r12, %rdi
+       call    *%rbx
+       /*
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
+        */
+       movq    $0, RAX(%rsp)
+       jmp     2b
 END(ret_from_fork)
 
 /*
index f848572169ea481dfe7d1dd33d4d8251018caeea..ff6ef7b3082237e5d223de9a9986202761566d17 100644 (file)
 377    i386    copy_file_range         sys_copy_file_range
 378    i386    preadv2                 sys_preadv2                     compat_sys_preadv2
 379    i386    pwritev2                sys_pwritev2                    compat_sys_pwritev2
+380    i386    pkey_mprotect           sys_pkey_mprotect
+381    i386    pkey_alloc              sys_pkey_alloc
+382    i386    pkey_free               sys_pkey_free
+#383   i386    pkey_get                sys_pkey_get
+#384   i386    pkey_set                sys_pkey_set
index e9ce9c7c39b48ca5e439735c87d6fe5b138d7fbc..2f024d02511da47e12cacfbeea9826c440de8f9e 100644 (file)
 326    common  copy_file_range         sys_copy_file_range
 327    64      preadv2                 sys_preadv2
 328    64      pwritev2                sys_pwritev2
+329    common  pkey_mprotect           sys_pkey_mprotect
+330    common  pkey_alloc              sys_pkey_alloc
+331    common  pkey_free               sys_pkey_free
+#332   common  pkey_get                sys_pkey_get
+#333   common  pkey_set                sys_pkey_set
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index e6131d4454e67005f91047d9e966243ae86aaf80..65577f081d072c5f774f763adc19a0e7a87e5627 100644 (file)
@@ -29,6 +29,8 @@
 
 #define COUNTER_SHIFT          16
 
+static HLIST_HEAD(uncore_unused_list);
+
 struct amd_uncore {
        int id;
        int refcnt;
@@ -39,7 +41,7 @@ struct amd_uncore {
        cpumask_t *active_mask;
        struct pmu *pmu;
        struct perf_event *events[MAX_COUNTERS];
-       struct amd_uncore *free_when_cpu_online;
+       struct hlist_node node;
 };
 
 static struct amd_uncore * __percpu *amd_uncore_nb;
@@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
                uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
                uncore_nb->active_mask = &amd_nb_active_mask;
                uncore_nb->pmu = &amd_nb_pmu;
+               uncore_nb->id = -1;
                *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
        }
 
@@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
                uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
                uncore_l2->active_mask = &amd_l2_active_mask;
                uncore_l2->pmu = &amd_l2_pmu;
+               uncore_l2->id = -1;
                *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
        }
 
@@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this,
                        continue;
 
                if (this->id == that->id) {
-                       that->free_when_cpu_online = this;
+                       hlist_add_head(&this->node, &uncore_unused_list);
                        this = that;
                        break;
                }
@@ -388,13 +392,23 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
        return 0;
 }
 
+static void uncore_clean_online(void)
+{
+       struct amd_uncore *uncore;
+       struct hlist_node *n;
+
+       hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
+               hlist_del(&uncore->node);
+               kfree(uncore);
+       }
+}
+
 static void uncore_online(unsigned int cpu,
                          struct amd_uncore * __percpu *uncores)
 {
        struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
 
-       kfree(uncore->free_when_cpu_online);
-       uncore->free_when_cpu_online = NULL;
+       uncore_clean_online();
 
        if (cpu == uncore->cpu)
                cpumask_set_cpu(cpu, uncore->active_mask);
index d0efb5cb1b00ece9656805cc62124454d00cda2d..dcaa887425b3ddc074b0ac481cb0ea08f2054b2a 100644 (file)
@@ -1201,6 +1201,9 @@ static int x86_pmu_add(struct perf_event *event, int flags)
         * If group events scheduling transaction was started,
         * skip the schedulability test here, it will be performed
         * at commit time (->commit_txn) as a whole.
+        *
+        * If commit fails, we'll call ->del() on all events
+        * for which ->add() was called.
         */
        if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
                goto done_collect;
@@ -1223,6 +1226,14 @@ done_collect:
        cpuc->n_added += n - n0;
        cpuc->n_txn += n - n0;
 
+       if (x86_pmu.add) {
+               /*
+                * This is before x86_pmu_enable() will call x86_pmu_start(),
+                * This runs before x86_pmu_enable() calls x86_pmu_start(),
+                * so LBRs and the like are enabled before any event needs them.
+               x86_pmu.add(event);
+       }
+
        ret = 0;
 out:
        return ret;
@@ -1346,7 +1357,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
        event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
 
        /*
-        * If we're called during a txn, we don't need to do anything.
+        * If we're called during a txn, we only need to undo x86_pmu.add.
         * The events never got scheduled and ->cancel_txn will truncate
         * the event_list.
         *
@@ -1354,7 +1365,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
         * an event added during that same TXN.
         */
        if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               return;
+               goto do_del;
 
        /*
         * Not a TXN, therefore cleanup properly.
@@ -1384,6 +1395,15 @@ static void x86_pmu_del(struct perf_event *event, int flags)
        --cpuc->n_events;
 
        perf_event_update_userpage(event);
+
+do_del:
+       if (x86_pmu.del) {
+               /*
+                * This runs after x86_pmu_stop(); LBRs and the like are
+                * disabled only once no event can need them any more.
+                */
+               x86_pmu.del(event);
+       }
 }
 
 int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -2277,7 +2297,8 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
                return;
        }
 
-       perf_callchain_store(entry, regs->ip);
+       if (perf_callchain_store(entry, regs->ip))
+               return;
 
        dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
index 0a6e393a2e6298bb9240831714d5915ab26abc5a..bdcd6510992c3338c5a3efba35ef3ef7c606bc6a 100644 (file)
 struct bts_ctx {
        struct perf_output_handle       handle;
        struct debug_store              ds_back;
-       int                             started;
+       int                             state;
+};
+
+/* BTS context states: */
+enum {
+       /* no ongoing AUX transactions */
+       BTS_STATE_STOPPED = 0,
+       /* AUX transaction is on, BTS tracing is disabled */
+       BTS_STATE_INACTIVE,
+       /* AUX transaction is on, BTS tracing is running */
+       BTS_STATE_ACTIVE,
 };
 
 static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
@@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts)
 static int
 bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);
 
+/*
+ * Ordering PMU callbacks wrt themselves and the PMI is done by means
+ * of bts::state, which:
+ *  - is set when bts::handle::event is valid, that is, between
+ *    perf_aux_output_begin() and perf_aux_output_end();
+ *  - is zero otherwise;
+ *  - is ordered against bts::handle::event with a compiler barrier.
+ */
+
 static void __bts_event_start(struct perf_event *event)
 {
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
@@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event)
 
        /*
         * local barrier to make sure that ds configuration made it
-        * before we enable BTS
+        * before we enable BTS and bts::state goes ACTIVE
         */
        wmb();
 
+       /* INACTIVE/STOPPED -> ACTIVE */
+       WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);
+
        intel_pmu_enable_bts(config);
 
 }
@@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags)
 
        __bts_event_start(event);
 
-       /* PMI handler: this counter is running and likely generating PMIs */
-       ACCESS_ONCE(bts->started) = 1;
-
        return;
 
 fail_end_stop:
@@ -263,30 +282,34 @@ fail_stop:
        event->hw.state = PERF_HES_STOPPED;
 }
 
-static void __bts_event_stop(struct perf_event *event)
+static void __bts_event_stop(struct perf_event *event, int state)
 {
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
+       WRITE_ONCE(bts->state, state);
+
        /*
         * No extra synchronization is mandated by the documentation to have
         * BTS data stores globally visible.
         */
        intel_pmu_disable_bts();
-
-       if (event->hw.state & PERF_HES_STOPPED)
-               return;
-
-       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
 }
 
 static void bts_event_stop(struct perf_event *event, int flags)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       struct bts_buffer *buf = NULL;
+       int state = READ_ONCE(bts->state);
 
-       /* PMI handler: don't restart this counter */
-       ACCESS_ONCE(bts->started) = 0;
+       if (state == BTS_STATE_ACTIVE)
+               __bts_event_stop(event, BTS_STATE_STOPPED);
 
-       __bts_event_stop(event);
+       if (state != BTS_STATE_STOPPED)
+               buf = perf_get_aux(&bts->handle);
+
+       event->hw.state |= PERF_HES_STOPPED;
 
        if (flags & PERF_EF_UPDATE) {
                bts_update(bts);
@@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags)
                                bts->handle.head =
                                        local_xchg(&buf->data_size,
                                                   buf->nr_pages << PAGE_SHIFT);
+
                        perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
                                            !!local_xchg(&buf->lost, 0));
                }
@@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags)
 void intel_bts_enable_local(void)
 {
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       int state = READ_ONCE(bts->state);
+
+       /*
+        * Here we transition from INACTIVE to ACTIVE;
+        * if we instead are STOPPED from the interrupt handler,
+        * stay that way. Can't be ACTIVE here though.
+        */
+       if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
+               return;
+
+       if (state == BTS_STATE_STOPPED)
+               return;
 
-       if (bts->handle.event && bts->started)
+       if (bts->handle.event)
                __bts_event_start(bts->handle.event);
 }
 
@@ -319,8 +355,15 @@ void intel_bts_disable_local(void)
 {
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
 
+       /*
+        * Here we transition from ACTIVE to INACTIVE;
+        * do nothing for STOPPED or INACTIVE.
+        */
+       if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
+               return;
+
        if (bts->handle.event)
-               __bts_event_stop(bts->handle.event);
+               __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
 }
 
 static int
@@ -335,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
                return 0;
 
        head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
-       if (WARN_ON_ONCE(head != local_read(&buf->head)))
-               return -EINVAL;
 
        phys = &buf->buf[buf->cur_buf];
        space = phys->offset + phys->displacement + phys->size - head;
@@ -403,22 +444,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
 
 int intel_bts_interrupt(void)
 {
+       struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct perf_event *event = bts->handle.event;
        struct bts_buffer *buf;
        s64 old_head;
-       int err;
+       int err = -ENOSPC, handled = 0;
 
-       if (!event || !bts->started)
-               return 0;
+       /*
+        * The only surefire way of knowing if this NMI is ours is by checking
+        * the write ptr against the PMI threshold.
+        */
+       if (ds->bts_index >= ds->bts_interrupt_threshold)
+               handled = 1;
+
+       /*
+        * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
+        * so we can only be INACTIVE or STOPPED
+        */
+       if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
+               return handled;
 
        buf = perf_get_aux(&bts->handle);
+       if (!buf)
+               return handled;
+
        /*
         * Skip snapshot counters: they don't use the interrupt, but
         * there's no other way of telling, because the pointer will
         * keep moving
         */
-       if (!buf || buf->snapshot)
+       if (buf->snapshot)
                return 0;
 
        old_head = local_read(&buf->head);
@@ -426,18 +482,27 @@ int intel_bts_interrupt(void)
 
        /* no new data */
        if (old_head == local_read(&buf->head))
-               return 0;
+               return handled;
 
        perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
                            !!local_xchg(&buf->lost, 0));
 
        buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return 1;
+       if (buf)
+               err = bts_buffer_reset(buf, &bts->handle);
+
+       if (err) {
+               WRITE_ONCE(bts->state, BTS_STATE_STOPPED);
 
-       err = bts_buffer_reset(buf, &bts->handle);
-       if (err)
-               perf_aux_output_end(&bts->handle, 0, false);
+               if (buf) {
+                       /*
+                        * BTS_STATE_STOPPED should be visible before
+                        * cleared handle::event
+                        */
+                       barrier();
+                       perf_aux_output_end(&bts->handle, 0, false);
+               }
+       }
 
        return 1;
 }
index 2cbde2f449aa8ced63adf14b14f9ceb3d464068c..88792f846d12bb37ac9194f9d48894206bb625ab 100644 (file)
@@ -1907,13 +1907,6 @@ static void intel_pmu_disable_event(struct perf_event *event)
        cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
        cpuc->intel_cp_status &= ~(1ull << hwc->idx);
 
-       /*
-        * must disable before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_disable(event);
-
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_disable_fixed(hwc);
                return;
@@ -1925,6 +1918,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
                intel_pmu_pebs_disable(event);
 }
 
+static void intel_pmu_del_event(struct perf_event *event)
+{
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_del(event);
+       if (event->attr.precise_ip)
+               intel_pmu_pebs_del(event);
+}
+
 static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@ -1968,12 +1969,6 @@ static void intel_pmu_enable_event(struct perf_event *event)
                intel_pmu_enable_bts(hwc->config);
                return;
        }
-       /*
-        * must enabled before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_enable(event);
 
        if (event->attr.exclude_host)
                cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1994,6 +1989,14 @@ static void intel_pmu_enable_event(struct perf_event *event)
        __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
+static void intel_pmu_add_event(struct perf_event *event)
+{
+       if (event->attr.precise_ip)
+               intel_pmu_pebs_add(event);
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_add(event);
+}
+
 /*
  * Save and restart an expired event. Called by NMI contexts,
  * so it has to be careful about preempting normal event ops:
@@ -3290,6 +3293,8 @@ static __initconst const struct x86_pmu intel_pmu = {
        .enable_all             = intel_pmu_enable_all,
        .enable                 = intel_pmu_enable_event,
        .disable                = intel_pmu_disable_event,
+       .add                    = intel_pmu_add_event,
+       .del                    = intel_pmu_del_event,
        .hw_config              = intel_pmu_hw_config,
        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
index 783c49ddef29c2336d458d1741fbdde5f69fecec..8f82b02934fa701a451ee5e7d8f76193eaefc2fa 100644 (file)
@@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info);
 static void init_mbm_sample(u32 rmid, u32 evt_type);
 static void __intel_mbm_event_count(void *info);
 
+static bool is_cqm_event(int e)
+{
+       return (e == QOS_L3_OCCUP_EVENT_ID);
+}
+
 static bool is_mbm_event(int e)
 {
        return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
@@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event)
             (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
                return -EINVAL;
 
+       if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
+           (is_mbm_event(event->attr.config) && !mbm_enabled))
+               return -EINVAL;
+
        /* unsupported modes and filters */
        if (event->attr.exclude_user   ||
            event->attr.exclude_kernel ||
index 7ce9f3f669e63d2bd4bbf15db801cf8bc5fa7565..0319311dbdbb548eef4f5b2431c285deea5dd00c 100644 (file)
@@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
        return &emptyconstraint;
 }
 
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+       return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+       struct debug_store *ds = cpuc->ds;
+       u64 threshold;
+
+       if (cpuc->n_pebs == cpuc->n_large_pebs) {
+               threshold = ds->pebs_absolute_maximum -
+                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+       } else {
+               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+       }
+
+       ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
 {
-       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+       /*
+        * Make sure we get updated with the first PEBS
+        * event. It will also trigger during removal, but
+        * that does not hurt:
+        */
+       bool update = cpuc->n_pebs == 1;
+
+       if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+               if (!needed_cb)
+                       perf_sched_cb_inc(pmu);
+               else
+                       perf_sched_cb_dec(pmu);
+
+               update = true;
+       }
+
+       if (update)
+               pebs_update_threshold(cpuc);
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+       cpuc->n_pebs++;
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+               cpuc->n_large_pebs++;
+
+       pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 }
 
 void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event)
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;
        struct debug_store *ds = cpuc->ds;
-       bool first_pebs;
-       u64 threshold;
 
        hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-       first_pebs = !pebs_is_enabled(cpuc);
        cpuc->pebs_enabled |= 1ULL << hwc->idx;
 
        if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event)
                cpuc->pebs_enabled |= 1ULL << 63;
 
        /*
-        * When the event is constrained enough we can use a larger
-        * threshold and run the event with less frequent PMI.
+        * Use auto-reload if possible to save an MSR write in the PMI.
+        * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
         */
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-               threshold = ds->pebs_absolute_maximum -
-                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-               if (first_pebs)
-                       perf_sched_cb_inc(event->ctx->pmu);
-       } else {
-               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-               /*
-                * If not all events can use larger buffer,
-                * roll back to threshold = 1
-                */
-               if (!first_pebs &&
-                   (ds->pebs_interrupt_threshold > threshold))
-                       perf_sched_cb_dec(event->ctx->pmu);
-       }
-
-       /* Use auto-reload if possible to save a MSR write in the PMI */
        if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
                ds->pebs_event_reset[hwc->idx] =
                        (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
        }
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       bool needed_cb = pebs_needs_sched_cb(cpuc);
 
-       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-               ds->pebs_interrupt_threshold = threshold;
+       cpuc->n_pebs--;
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+               cpuc->n_large_pebs--;
+
+       pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool large_pebs = ds->pebs_interrupt_threshold >
-               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
 
-       if (large_pebs)
+       if (cpuc->n_pebs == cpuc->n_large_pebs)
                intel_pmu_drain_pebs_buffer();
 
        cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
        else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
                cpuc->pebs_enabled &= ~(1ULL << 63);
 
-       if (large_pebs && !pebs_is_enabled(cpuc))
-               perf_sched_cb_dec(event->ctx->pmu);
-
        if (cpuc->enabled)
                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
@@ -1274,18 +1312,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
                struct pebs_record_nhm *p = at;
                u64 pebs_status;
 
-               /* PEBS v3 has accurate status bits */
+               pebs_status = p->status & cpuc->pebs_enabled;
+               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+               /* PEBS v3 has more accurate status bits */
                if (x86_pmu.intel_cap.pebs_format >= 3) {
-                       for_each_set_bit(bit, (unsigned long *)&p->status,
-                                        MAX_PEBS_EVENTS)
+                       for_each_set_bit(bit, (unsigned long *)&pebs_status,
+                                        x86_pmu.max_pebs_events)
                                counts[bit]++;
 
                        continue;
                }
 
-               pebs_status = p->status & cpuc->pebs_enabled;
-               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
-
                /*
                 * On some CPUs the PEBS status can be zero when PEBS is
                 * racing with clearing of GLOBAL_STATUS.
@@ -1333,8 +1371,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
                        continue;
 
                event = cpuc->events[bit];
-               WARN_ON_ONCE(!event);
-               WARN_ON_ONCE(!event->attr.precise_ip);
+               if (WARN_ON_ONCE(!event))
+                       continue;
+
+               if (WARN_ON_ONCE(!event->attr.precise_ip))
+                       continue;
 
                /* log dropped samples number */
                if (error[bit])
index 707d358e0dff59ed5871bc49ceda0462c4f818be..fc6cf21c535e19f4c07deccf4a6444199e5ae39c 100644 (file)
@@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct x86_perf_task_context *task_ctx;
 
        /*
@@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
         */
        task_ctx = ctx ? ctx->task_ctx_data : NULL;
        if (task_ctx) {
-               if (sched_in) {
+               if (sched_in)
                        __intel_pmu_lbr_restore(task_ctx);
-                       cpuc->lbr_context = ctx;
-               } else {
+               else
                        __intel_pmu_lbr_save(task_ctx);
-               }
                return;
        }
 
        /*
-        * When sampling the branch stack in system-wide, it may be
-        * necessary to flush the stack on context switch. This happens
-        * when the branch stack does not tag its entries with the pid
-        * of the current task. Otherwise it becomes impossible to
-        * associate a branch entry with a task. This ambiguity is more
-        * likely to appear when the branch stack supports priv level
-        * filtering and the user sets it to monitor only at the user
-        * level (which could be a useful measurement in system-wide
-        * mode). In that case, the risk is high of having a branch
-        * stack with branch from multiple tasks.
-        */
-       if (sched_in) {
+        * Since a context switch can flip the address space and LBR entries
+        * are not tagged with an identifier, we need to wipe the LBR, even for
+        * per-cpu events. You simply cannot resolve the branches from the old
+        * address space.
+        */
+       if (sched_in)
                intel_pmu_lbr_reset();
-               cpuc->lbr_context = ctx;
-       }
 }
 
 static inline bool branch_user_callstack(unsigned br_sel)
@@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
        return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
 }
 
-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_add(struct perf_event *event)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct x86_perf_task_context *task_ctx;
@@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event)
        if (!x86_pmu.lbr_nr)
                return;
 
-       /*
-        * Reset the LBR stack if we changed task context to
-        * avoid data leaks.
-        */
-       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = event->ctx;
-       }
        cpuc->br_sel = event->hw.branch_reg.reg;
 
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
                task_ctx = event->ctx->task_ctx_data;
                task_ctx->lbr_callstack_users++;
        }
 
-       cpuc->lbr_users++;
+       /*
+        * Request the pmu::sched_task() callback, which will fire inside the
+        * regular perf event scheduling, so that call will:
+        *
+        *  - restore or wipe, when using LBR-callstack;
+        *  - wipe, otherwise;
+        *
+        * when this is from __perf_event_task_sched_in().
+        *
+        * However, if this is from perf_install_in_context(), no such callback
+        * will follow and we'll need to reset the LBR here if this is the
+        * first LBR event.
+        *
+        * The problem is, we cannot tell these cases apart... but we can
+        * exclude the biggest chunk of cases by looking at
+        * event->total_time_running. An event that has accrued runtime cannot
+        * be 'new'. Conversely, a new event can get installed through the
+        * context switch path for the first time.
+        */
        perf_sched_cb_inc(event->ctx->pmu);
+       if (!cpuc->lbr_users++ && !event->total_time_running)
+               intel_pmu_lbr_reset();
 }
 
-void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_del(struct perf_event *event)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct x86_perf_task_context *task_ctx;
@@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event)
        cpuc->lbr_users--;
        WARN_ON_ONCE(cpuc->lbr_users < 0);
        perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled && !cpuc->lbr_users) {
-               __intel_pmu_lbr_disable();
-               /* avoid stale pointer */
-               cpuc->lbr_context = NULL;
-       }
 }
 
 void intel_pmu_lbr_enable_all(bool pmi)
index 28865938aadf267e42829c3393cc405b0bf0672e..b0f0e835a770f7ee959681513bd7e41af39827a0 100644 (file)
@@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event)
        if (event->cpu < 0)
                return -EINVAL;
 
+       event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
        /*
         * check event is known (determines counter)
         */
@@ -765,6 +767,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE,  skl_rapl_init),
        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
        X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,       hsx_rapl_init),
+
+       X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
        {},
 };
 
index 463dc7a5a6c3b1ad8aff48aa6507db85f857463c..d9844cc74486e602c7b768e225856323a37025b8 100644 (file)
@@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
        event->cpu = box->cpu;
        event->pmu_private = box;
 
+       event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
        event->hw.idx = -1;
        event->hw.last_tag = ~0ULL;
        event->hw.extra_reg.idx = EXTRA_REG_NONE;
@@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
                /* fixed counters have event field hardcoded to zero */
                hwc->config = 0ULL;
        } else {
-               hwc->config = event->attr.config & pmu->type->event_mask;
+               hwc->config = event->attr.config &
+                             (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
                if (pmu->type->ops->hw_config) {
                        ret = pmu->type->ops->hw_config(box, event);
                        if (ret)
@@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
        .pci_init = skl_uncore_pci_init,
 };
 
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+       .cpu_init = skx_uncore_cpu_init,
+       .pci_init = skx_uncore_pci_init,
+};
+
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
        X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP,     nhm_uncore_init),
        X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM,        nhm_uncore_init),
@@ -1343,6 +1351,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
        X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,   knl_uncore_init),
        X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
        X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
+       X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,      skx_uncore_init),
        {},
 };
 
index 78b9c23e2d8defc2325c7edbd2d5b339605a75dc..ad986c1e29bccd7d5303d94bcbf2caa9223fcf74 100644 (file)
@@ -44,6 +44,7 @@ struct intel_uncore_type {
        unsigned perf_ctr;
        unsigned event_ctl;
        unsigned event_mask;
+       unsigned event_mask_ext;
        unsigned fixed_ctr;
        unsigned fixed_ctl;
        unsigned box_ctl;
@@ -120,6 +121,7 @@ struct intel_uncore_box {
 };
 
 #define UNCORE_BOX_FLAG_INITIATED      0
+#define UNCORE_BOX_FLAG_CTL_OFFS8      1 /* event config registers are 8-byte apart */
 
 struct uncore_event_desc {
        struct kobj_attribute attr;
@@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
 static inline
 unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
 {
+       if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+               return idx * 8 + box->pmu->type->event_ctl;
+
        return idx * 4 + box->pmu->type->event_ctl;
 }
 
@@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void);
 void bdx_uncore_cpu_init(void);
 int knl_uncore_pci_init(void);
 void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
 
 /* perf_event_intel_uncore_nhmex.c */
 void nhmex_uncore_cpu_init(void);
index 9d35ec0cb8fc916ba3b4b63f5bdb1b6ebda5de55..5f845eef9a4d682a3598c13946dbdec9fb70adeb 100644 (file)
@@ -388,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
        event->cpu = box->cpu;
        event->pmu_private = box;
 
+       event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
        event->hw.idx = -1;
        event->hw.last_tag = ~0ULL;
        event->hw.extra_reg.idx = EXTRA_REG_NONE;
index 8aee83bcf71f2dc5a380009957d0858a7b4a2507..272427700d48deebdbcb582b1bd4af143e44a58c 100644 (file)
@@ -1,6 +1,10 @@
 /* SandyBridge-EP/IvyTown uncore support */
 #include "uncore.h"
 
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID                        0x40
+#define SNBEP_GIDNIDMAP                        0x54
+
 /* SNB-EP Box level control */
 #define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
 #define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
                                 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
                                 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
 
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID                  0xc0
+#define SKX_GIDNIDMAP                  0xd4
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID                (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK       (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE      (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM                (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC                (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC    (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM         (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM     (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0       (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1       (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6         (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC         (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC       (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0         0xa48
+#define SKX_IIO0_MSR_PMON_CTR0         0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL      0xa40
+#define SKX_IIO_MSR_OFFSET             0x20
+
+#define SKX_PMON_CTL_TRESH_MASK                (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT    (0xf)
+#define SKX_PMON_CTL_CH_MASK           (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK           (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK    (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_INVERT | \
+                                        SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT        (SKX_PMON_CTL_TRESH_MASK_EXT | \
+                                        SKX_PMON_CTL_CH_MASK | \
+                                        SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0         0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0         0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL      0xa58
+#define SKX_IRP_MSR_OFFSET             0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0          0x350
+#define SKX_UPI_PCI_PMON_CTR0          0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL       0x378
+#define SKX_PMON_CTL_UMASK_EXT         0xff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0          0x228
+#define SKX_M2M_PCI_PMON_CTR0          0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL       0x258
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
 DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
 DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
 DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
 DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
 DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
 DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
 DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
@@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
 DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
 DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
 DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
@@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
 DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
 DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
 DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
 DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
 DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
 DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
 DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
 /*
  * build pci bus to socket mapping
  */
-static int snbep_pci2phy_map_init(int devid)
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
 {
        struct pci_dev *ubox_dev = NULL;
        int i, bus, nodeid, segment;
@@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid)
                        break;
                bus = ubox_dev->bus->number;
                /* get the Node ID of the local register */
-               err = pci_read_config_dword(ubox_dev, 0x40, &config);
+               err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
                if (err)
                        break;
                nodeid = config;
                /* get the Node ID mapping */
-               err = pci_read_config_dword(ubox_dev, 0x54, &config);
+               err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
                if (err)
                        break;
 
@@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid)
                raw_spin_lock(&pci2phy_map_lock);
                list_for_each_entry(map, &pci2phy_map_head, list) {
                        i = -1;
-                       for (bus = 255; bus >= 0; bus--) {
-                               if (map->pbus_to_physid[bus] >= 0)
-                                       i = map->pbus_to_physid[bus];
-                               else
-                                       map->pbus_to_physid[bus] = i;
+                       if (reverse) {
+                               for (bus = 255; bus >= 0; bus--) {
+                                       if (map->pbus_to_physid[bus] >= 0)
+                                               i = map->pbus_to_physid[bus];
+                                       else
+                                               map->pbus_to_physid[bus] = i;
+                               }
+                       } else {
+                               for (bus = 0; bus <= 255; bus++) {
+                                       if (map->pbus_to_physid[bus] >= 0)
+                                               i = map->pbus_to_physid[bus];
+                                       else
+                                               map->pbus_to_physid[bus] = i;
+                               }
                        }
                }
                raw_spin_unlock(&pci2phy_map_lock);
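/*
 * Worked example of the two fill directions (bus numbers hypothetical):
 * suppose the ubox probe assigned pbus_to_physid[0x3f] = 0 and
 * pbus_to_physid[0x7f] = 1.  The reverse walk (SNB-EP through BDX,
 * presumably because the ubox sits on the highest bus of each socket's
 * range) yields 0x00-0x3f -> 0 and 0x40-0x7f -> 1, while the forward walk
 * used for SKX (ubox on the lowest bus of the range) yields
 * 0x3f-0x7e -> 0 and 0x7f-0xff -> 1; buses outside the covered span
 * keep -1.
 */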
@@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid)
 
 int snbep_uncore_pci_init(void)
 {
-       int ret = snbep_pci2phy_map_init(0x3ce0);
+       int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
        if (ret)
                return ret;
        uncore_pci_uncores = snbep_pci_uncores;
@@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = {
 
 int ivbep_uncore_pci_init(void)
 {
-       int ret = snbep_pci2phy_map_init(0x0e1e);
+       int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
        if (ret)
                return ret;
        uncore_pci_uncores = ivbep_pci_uncores;
@@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = {
 
 int hswep_uncore_pci_init(void)
 {
-       int ret = snbep_pci2phy_map_init(0x2f1e);
+       int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
        if (ret)
                return ret;
        uncore_pci_uncores = hswep_pci_uncores;
@@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = {
 
 int bdx_uncore_pci_init(void)
 {
-       int ret = snbep_pci2phy_map_init(0x6f1e);
+       int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
 
        if (ret)
                return ret;
@@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void)
 }
 
 /* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops                    = &ivbep_uncore_msr_ops,
+       .format_group           = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid4.attr,
+       &format_attr_filter_link4.attr,
+       &format_attr_filter_state5.attr,
+       &format_attr_filter_rem.attr,
+       &format_attr_filter_loc.attr,
+       &format_attr_filter_nm.attr,
+       &format_attr_filter_all_op.attr,
+       &format_attr_filter_not_nm.attr,
+       &format_attr_filter_opc_0.attr,
+       &format_attr_filter_opc_1.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group skx_uncore_chabox_format_group = {
+       .name = "format",
+       .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+};
+
+static u64 skx_cha_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+       return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+       /* There is no frz_en for chabox ctl */
+       .init_box               = ivbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = skx_cha_hw_config,
+       .get_constraint         = skx_cha_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+       .name                   = "cha",
+       .num_counters           = 4,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = skx_uncore_chabox_constraints,
+       .ops                    = &skx_uncore_chabox_ops,
+       .format_group           = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh9.attr,
+       &format_attr_ch_mask.attr,
+       &format_attr_fc_mask.attr,
+       NULL,
+};
+
+static struct attribute_group skx_uncore_iio_format_group = {
+       .name = "format",
+       .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+       EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+                                struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+       .init_box               = ivbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = skx_iio_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_iio = {
+       .name                   = "iio",
+       .num_counters           = 4,
+       .num_boxes              = 5,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = SKX_IIO0_MSR_PMON_CTL0,
+       .perf_ctr               = SKX_IIO0_MSR_PMON_CTR0,
+       .event_mask             = SKX_IIO_PMON_RAW_EVENT_MASK,
+       .event_mask_ext         = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+       .box_ctl                = SKX_IIO0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SKX_IIO_MSR_OFFSET,
+       .constraints            = skx_uncore_iio_constraints,
+       .ops                    = &skx_uncore_iio_ops,
+       .format_group           = &skx_uncore_iio_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group skx_uncore_format_group = {
+       .name = "format",
+       .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 2,
+       .num_boxes              = 5,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = SKX_IRP0_MSR_PMON_CTL0,
+       .perf_ctr               = SKX_IRP0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SKX_IRP0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SKX_IRP_MSR_OFFSET,
+       .ops                    = &skx_uncore_iio_ops,
+       .format_group           = &skx_uncore_format_group,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &skx_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+       &skx_uncore_ubox,
+       &skx_uncore_chabox,
+       &skx_uncore_iio,
+       &skx_uncore_irp,
+       &skx_uncore_pcu,
+       NULL,
+};
+
+static int skx_count_chabox(void)
+{
+       struct pci_dev *chabox_dev = NULL;
+       int bus, count = 0;
+
+       while (1) {
+               chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev);
+               if (!chabox_dev)
+                       break;
+               if (count == 0)
+                       bus = chabox_dev->bus->number;
+               if (bus != chabox_dev->bus->number)
+                       break;
+               count++;
+       }
+
+       pci_dev_put(chabox_dev);
+       return count;
+}
+
+void skx_uncore_cpu_init(void)
+{
+       skx_uncore_chabox.num_boxes = skx_count_chabox();
+       uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 6,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,
+       .ops            = &ivbep_uncore_pci_ops,
+       .format_group   = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask_ext.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group skx_upi_uncore_format_group = {
+       .name = "format",
+       .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+
+       __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+       pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+       .init_box       = skx_upi_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_uncore_pci_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_upi = {
+       .name           = "upi",
+       .num_counters   = 4,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 48,
+       .perf_ctr       = SKX_UPI_PCI_PMON_CTR0,
+       .event_ctl      = SKX_UPI_PCI_PMON_CTL0,
+       .event_mask     = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+       .box_ctl        = SKX_UPI_PCI_PMON_BOX_CTL,
+       .ops            = &skx_upi_uncore_pci_ops,
+       .format_group   = &skx_upi_uncore_format_group,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+
+       __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+       pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+       .init_box       = skx_m2m_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_uncore_pci_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+       .name           = "m2m",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       .perf_ctr       = SKX_M2M_PCI_PMON_CTR0,
+       .event_ctl      = SKX_M2M_PCI_PMON_CTL0,
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl        = SKX_M2M_PCI_PMON_BOX_CTL,
+       .ops            = &skx_m2m_uncore_pci_ops,
+       .format_group   = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+       .name           = "m2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 48,
+       .constraints    = skx_uncore_m2pcie_constraints,
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,
+       .ops            = &ivbep_uncore_pci_ops,
+       .format_group   = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+       .name           = "m3upi",
+       .num_counters   = 3,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 48,
+       .constraints    = skx_uncore_m3upi_constraints,
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,
+       .ops            = &ivbep_uncore_pci_ops,
+       .format_group   = &skx_uncore_format_group,
+};
+
+enum {
+       SKX_PCI_UNCORE_IMC,
+       SKX_PCI_UNCORE_M2M,
+       SKX_PCI_UNCORE_UPI,
+       SKX_PCI_UNCORE_M2PCIE,
+       SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+       [SKX_PCI_UNCORE_IMC]    = &skx_uncore_imc,
+       [SKX_PCI_UNCORE_M2M]    = &skx_uncore_m2m,
+       [SKX_PCI_UNCORE_UPI]    = &skx_uncore_upi,
+       [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+       [SKX_PCI_UNCORE_M3UPI]  = &skx_uncore_m3upi,
+       NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+       },
+       { /* M2M0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+       },
+       { /* M2M1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+       },
+       { /* UPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+       },
+       { /* UPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+       },
+       { /* UPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+       },
+       { /* M2PCIe 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+       },
+       { /* M2PCIe 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+       },
+       { /* M2PCIe 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+       },
+       { /* M2PCIe 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+       },
+       { /* M3UPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0),
+       },
+       { /* M3UPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1),
+       },
+       { /* M3UPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+               .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver skx_uncore_pci_driver = {
+       .name           = "skx_uncore",
+       .id_table       = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+       /* need to double check pci address */
+       int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+       if (ret)
+               return ret;
+
+       uncore_pci_uncores = skx_pci_uncores;
+       uncore_pci_driver = &skx_uncore_pci_driver;
+       return 0;
+}
+
+/* end of SKX uncore support */
index 8c4a47706296ab49a2c3825c6312e207e608bdeb..5874d8de1f8da111e3e0a682835e42b23af875ff 100644 (file)
@@ -194,12 +194,13 @@ struct cpu_hw_events {
         */
        struct debug_store      *ds;
        u64                     pebs_enabled;
+       int                     n_pebs;
+       int                     n_large_pebs;
 
        /*
         * Intel LBR bits
         */
        int                             lbr_users;
-       void                            *lbr_context;
        struct perf_branch_stack        lbr_stack;
        struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
        struct er_account               *lbr_sel;
@@ -508,6 +509,8 @@ struct x86_pmu {
        void            (*enable_all)(int added);
        void            (*enable)(struct perf_event *);
        void            (*disable)(struct perf_event *);
+       void            (*add)(struct perf_event *);
+       void            (*del)(struct perf_event *);
        int             (*hw_config)(struct perf_event *event);
        int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
        unsigned        eventsel;
@@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
 
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);
 
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
 void intel_pmu_pebs_enable(struct perf_event *event);
 
 void intel_pmu_pebs_disable(struct perf_event *event);
@@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val);
 
 void intel_pmu_lbr_reset(void);
 
-void intel_pmu_lbr_enable(struct perf_event *event);
+void intel_pmu_lbr_add(struct perf_event *event);
 
-void intel_pmu_lbr_disable(struct perf_event *event);
+void intel_pmu_lbr_del(struct perf_event *event);
 
 void intel_pmu_lbr_enable_all(bool pmi);
 
index 92a8308b96f64cb6ce845a8379ca06cb9a6a00d6..1188bc849ee3b3253fd8229fca21bf2d6c87856e 100644 (file)
 #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
index 4e10d73cf01844edc201ed23f51d28a72a537521..12080d87da3b189f421d421c5d6a3cca5dc83014 100644 (file)
@@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
-extern struct desc_ptr debug_idt_descr;
+extern const struct desc_ptr debug_idt_descr;
 extern gate_desc debug_idt_table[];
 
 struct gdt_page {
index ae55a43e09c0f20846919680f85a1c15538432bc..d4957ac72b488245880a8fb7c5885db558cd66c7 100644 (file)
@@ -45,7 +45,8 @@
 extern u64 xfeatures_mask;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+extern void __init update_regset_xstate_info(unsigned int size,
+                                            u64 xstate_mask);
 
 void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
index a4820d4df617daede213516eea42becd5b874640..eccd0ac6bc38857904565619857c68a7f8a494e6 100644 (file)
@@ -6,6 +6,7 @@
 # define MCOUNT_ADDR           ((unsigned long)(__fentry__))
 #else
 # define MCOUNT_ADDR           ((unsigned long)(mcount))
+# define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
 #define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
 
@@ -13,6 +14,8 @@
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #endif
 
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 extern atomic_t modifying_ftrace_code;
index 055ea9941dd5f671d116306a6544adf7a4f189bc..67942b6ad4b7c6f2ced9eac4779eeea72b3472b0 100644 (file)
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
 
        /* X2APIC detection (run once per boot) */
        bool            (*x2apic_available)(void);
+
+       /* pin current vcpu to specified physical cpu (run rarely) */
+       void            (*pin_vcpu)(int);
 };
 
 extern const struct hypervisor_x86 *x86_hyper;
@@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
 extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 extern bool hypervisor_x2apic_available(void);
+extern void hypervisor_pin_vcpu(int cpu);
 #else
 static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
 static inline void init_hypervisor_platform(void) { }
index 627719475457d00921d7ed80a5527c36aecc12ed..9ae5ab80a497c35a9fa3048c532bdb9239c395e1 100644 (file)
@@ -56,8 +56,8 @@
 #define INTEL_FAM6_ATOM_SILVERMONT1    0x37 /* BayTrail/BYT / Valleyview */
 #define INTEL_FAM6_ATOM_SILVERMONT2    0x4D /* Avaton/Rangely */
 #define INTEL_FAM6_ATOM_AIRMONT                0x4C /* CherryTrail / Braswell */
-#define INTEL_FAM6_ATOM_MERRIFIELD1    0x4A /* Tangier */
-#define INTEL_FAM6_ATOM_MERRIFIELD2    0x5A /* Annidale */
+#define INTEL_FAM6_ATOM_MERRIFIELD     0x4A /* Tangier */
+#define INTEL_FAM6_ATOM_MOOREFIELD     0x5A /* Annidale */
 #define INTEL_FAM6_ATOM_GOLDMONT       0x5C
 #define INTEL_FAM6_ATOM_DENVERTON      0x5F /* Goldmont Microserver */
 
index 9d6b097aa73dfb4242e13614846bf3948c41066f..5b6753d1f7f4e35f8066e89555c715badc46ca23 100644 (file)
@@ -18,6 +18,8 @@
 extern int intel_mid_pci_init(void);
 extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
 
+extern void intel_mid_pwr_power_off(void);
+
 #define INTEL_MID_PWR_LSS_OFFSET       4
 #define INTEL_MID_PWR_LSS_TYPE         (1 << 7)
 
index 925b605eb5c601fa9a6f6c24cf41e596b92d86f8..4fb1d0abef953dbf97cb9e002d3b74d88bc40483 100644 (file)
@@ -3,6 +3,8 @@
 
 #include <linux/notifier.h>
 
+#define IPCMSG_COLD_OFF                0x80    /* Only for Tangier */
+
 #define IPCMSG_WARM_RESET      0xF0
 #define IPCMSG_COLD_RESET      0xF1
 #define IPCMSG_SOFT_RESET      0xF2
index 2674ee3de7486afde5a37ffc8309d9aac51a2e9b..1052a797d71d0b488eb6973b1ec512005afbe757 100644 (file)
@@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose);
 #ifdef CONFIG_RANDOMIZE_MEMORY
 extern unsigned long page_offset_base;
 extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
 
 void kernel_randomize_memory(void);
 #else
index 1ef9d581b5d9829365160082acbb98d1f549d421..d318811884318aa8691cac64a7e34384446ec17e 100644 (file)
@@ -24,8 +24,6 @@ enum die_val {
 extern void printk_address(unsigned long address);
 extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-                      unsigned long *sp, unsigned long bp);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
index 1ea0baef1175c407e9097e125a84b226be28266c..72198c64e646d3cbb0d9831d950e4e815003cdd3 100644 (file)
@@ -23,6 +23,14 @@ typedef struct {
        const struct vdso_image *vdso_image;    /* vdso image in use */
 
        atomic_t perf_rdpmc_allowed;    /* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+       /*
+        * One bit per protection key says whether userspace can
+        * use it or not.  Protected by mmap_sem.
+        */
+       u16 pkey_allocation_map;
+       s16 execute_only_pkey;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
index d8abfcf524d102573a53e456fec24592cf933a01..8e0a9fe86de466eee5a520aa8ef9776a0e0e637d 100644 (file)
@@ -4,6 +4,7 @@
 #include <asm/desc.h>
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
+#include <linux/pkeys.h>
 
 #include <trace/events/tlb.h>
 
@@ -107,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+       if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+               /* pkey 0 is the default and always allocated */
+               mm->context.pkey_allocation_map = 0x1;
+               /* -1 means unallocated or invalid */
+               mm->context.execute_only_pkey = -1;
+       }
+       #endif
        init_new_context_ldt(tsk, mm);
+
        return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
@@ -195,16 +205,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
                mpx_notify_unmap(mm, vma, start, end);
 }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
-       u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
                                      VM_PKEY_BIT2 | VM_PKEY_BIT3;
-       pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
-#endif
-       return pkey;
+
+       return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+#else
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+       return 0;
 }
+#endif
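/*
 * Worked example of the flag packing read back above: the four
 * VM_PKEY_BIT* flags occupy consecutive bits starting at VM_PKEY_SHIFT,
 * so a VMA with VM_PKEY_BIT0 and VM_PKEY_BIT2 set carries
 * (0b0101 << VM_PKEY_SHIFT) in vm_flags, and vma_pkey() returns 5.
 */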
 
 static inline bool __pkru_allows_pkey(u16 pkey, bool write)
 {
@@ -258,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
 {
        return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
 }
-
 #endif /* _ASM_X86_MMU_CONTEXT_H */
index 6fdef9eef2d51d58aad4ff2f2ef9248585093d73..3a264200c62fd2067c387ffaecb27309727ff1de 100644 (file)
@@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM         _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 #define VMALLOC_SIZE_TB        _AC(32, UL)
 #define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define VMEMMAP_START  _AC(0xffffea0000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
 #ifdef CONFIG_RANDOMIZE_MEMORY
 #define VMALLOC_START  vmalloc_base
+#define VMEMMAP_START  vmemmap_base
 #else
 #define VMALLOC_START  __VMALLOC_BASE
+#define VMEMMAP_START  __VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
 #define VMALLOC_END    (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
 #define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
index 7b84565c916c7b0329ac92c79c789748ff9eeb28..34684adb6899ad132e44e159be621d92370b99c0 100644 (file)
@@ -10,7 +10,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
  * Try to dedicate one of the protection keys to be used as an
  * execute-only protection key.
  */
-#define PKEY_DEDICATED_EXECUTE_ONLY 15
 extern int __execute_only_pkey(struct mm_struct *mm);
 static inline int execute_only_pkey(struct mm_struct *mm)
 {
@@ -31,4 +30,76 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
        return __arch_override_mprotect_pkey(vma, prot, pkey);
 }
 
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+               unsigned long init_val);
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
+
+#define mm_pkey_allocation_map(mm)     (mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do {           \
+       mm_pkey_allocation_map(mm) |= (1U << pkey);     \
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do {                        \
+       mm_pkey_allocation_map(mm) &= ~(1U << pkey);    \
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+       return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 4-bit key on success, or -1 on failure.
+ */
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+       /*
+        * Note: this is the one and only place we make sure
+        * that the pkey is valid as far as the hardware is
+        * concerned.  The rest of the kernel trusts that
+        * only good, valid pkeys come out of here.
+        */
+       u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+       int ret;
+
+       /*
+        * Are we out of pkeys?  We must handle this specially
+        * because ffz() behavior is undefined if there are no
+        * zeros.
+        */
+       if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+               return -1;
+
+       ret = ffz(mm_pkey_allocation_map(mm));
+
+       mm_set_pkey_allocated(mm, ret);
+
+       return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+       /*
+        * pkey 0 is special, always allocated and can never
+        * be freed.
+        */
+       if (!pkey)
+               return -EINVAL;
+       if (!mm_pkey_is_allocated(mm, pkey))
+               return -EINVAL;
+
+       mm_set_pkey_free(mm, pkey);
+
+       return 0;
+}
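/*
 * Minimal usage sketch (not part of the patch): the allocation map is
 * documented above as protected by mmap_sem, so a hypothetical caller
 * pairs the two helpers roughly like this.
 */
static int sketch_pkey_alloc_then_free(struct mm_struct *mm)
{
	int pkey;

	down_write(&mm->mmap_sem);
	pkey = mm_pkey_alloc(mm);	/* -1 once all 16 keys are in use */
	if (pkey > 0)
		mm_pkey_free(mm, pkey);	/* pkey 0 itself can never be freed */
	up_write(&mm->mmap_sem);

	return pkey;
}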
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+               unsigned long init_val);
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+               unsigned long init_val);
+extern void copy_init_pkru_to_fpregs(void);
+
 #endif /*_ASM_X86_PKEYS_H */
index 643eba42d6206aa0fbcb57baa150606269577523..2c1ebeb4d7376db6350b7266048a1d37d800ac86 100644 (file)
@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
 
 static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
 {
-       if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
-               return memcpy_mcsafe(dst, src, n);
-       memcpy(dst, src, n);
-       return 0;
+       return memcpy_mcsafe(dst, src, n);
 }
 
 /**
index 63def9537a2d249f5814cf73fac20f7d8873e232..b22fb5a4ff3cf1a70796bbac6a367e7a6fda55f5 100644 (file)
@@ -389,9 +389,6 @@ struct thread_struct {
        unsigned short          fsindex;
        unsigned short          gsindex;
 #endif
-#ifdef CONFIG_X86_32
-       unsigned long           ip;
-#endif
 #ifdef CONFIG_X86_64
        unsigned long           fsbase;
        unsigned long           gsbase;
@@ -724,8 +721,6 @@ static inline void spin_lock_prefetch(const void *x)
        .addr_limit             = KERNEL_DS,                              \
 }
 
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
 /*
  * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
  * This is necessary to guarantee that the entire "struct pt_regs"
@@ -776,17 +771,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
        .addr_limit             = KERNEL_DS,                    \
 }
 
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t)     READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
-
 #define task_pt_regs(tsk)      ((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
 #endif /* CONFIG_X86_64 */
 
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
                                               unsigned long new_sp);
 
index b2988c0ed82961c043b682d44b829c6db00a1e46..230e1903acf07faa831c8d2496a4457aaae9b5ea 100644 (file)
@@ -44,9 +44,9 @@ struct trampoline_header {
 extern struct real_mode_header *real_mode_header;
 extern unsigned char real_mode_blob_end[];
 
-extern unsigned long init_rsp;
 extern unsigned long initial_code;
 extern unsigned long initial_gs;
+extern unsigned long initial_stack;
 
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
index ebd0c164cd4e9033ebb42ff119f8cfe05c901f65..19980b36f394b18e6816629390130fa3eb789115 100644 (file)
@@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
 #endif
 
-/* Static state in head.S used to set up a CPU */
-extern unsigned long stack_start; /* Initial stack pointer address */
-
 struct task_struct;
 
 struct smp_ops {
index 0944218af9e279d631af9f03506a9672c4443c6d..3552f5e7189e74b54ada5bee83dcc4dc47a155f0 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <linux/uaccess.h>
 #include <linux/ptrace.h>
+#include <asm/switch_to.h>
 
 extern int kstack_depth_to_print;
 
@@ -49,37 +50,41 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 
 #ifdef CONFIG_X86_32
 #define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
 #else
 #define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
 #ifdef CONFIG_FRAME_POINTER
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
 {
-       unsigned long bp;
-
        if (regs)
-               return regs->bp;
+               return (unsigned long *)regs->bp;
 
-       if (task == current) {
-               /* Grab bp right from our regs */
-               get_bp(bp);
-               return bp;
-       }
+       if (!task || task == current)
+               return __builtin_frame_address(0);
 
-       /* bp is the last reg pushed by switch_to */
-       return *(unsigned long *)task->thread.sp;
+       return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp;
 }
 #else
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
 {
-       return 0;
+       return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+       if (regs)
+               return (unsigned long *)kernel_stack_pointer(regs);
+
+       if (!task || task == current)
+               return __builtin_frame_address(0);
+
+       return (unsigned long *)task->thread.sp;
 }
-#endif
 
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -106,7 +111,7 @@ static inline unsigned long caller_frame_pointer(void)
 {
        struct stack_frame *frame;
 
-       get_bp(frame);
+       frame = __builtin_frame_address(0);
 
 #ifdef CONFIG_FRAME_POINTER
        frame = frame->next_frame;

index 90dbbd9666d43cef657542251b4824d29c0fa7b9..a164862d77e3b4573e7a264c8712fac06ef7a830 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_STRING_64_H
 
 #ifdef __KERNEL__
+#include <linux/jump_label.h>
 
 /* Written 2002 by Andi Kleen */
 
@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+DECLARE_STATIC_KEY_FALSE(mcsafe_key);
+
 /**
  * memcpy_mcsafe - copy memory with indication if a machine check happened
  *
@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
  * @cnt:       number of bytes to copy
  *
  * Low level memory copy function that catches machine checks
+ * We only call into the "safe" function on systems that can
+ * actually do machine check recovery. Everyone else can just
+ * use memcpy().
  *
  * Return 0 for success, -EFAULT for fail
  */
-int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+static __always_inline __must_check int
+memcpy_mcsafe(void *dst, const void *src, size_t cnt)
+{
+#ifdef CONFIG_X86_MCE
+       if (static_branch_unlikely(&mcsafe_key))
+               return memcpy_mcsafe_unrolled(dst, src, cnt);
+       else
+#endif
+               memcpy(dst, src, cnt);
+       return 0;
+}
 
 #endif /* __KERNEL__ */
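
With the static-key dispatch above, callers no longer need their own X86_FEATURE_MCE_RECOVERY check (see the arch_memcpy_from_pmem() cleanup earlier in this merge); they simply call memcpy_mcsafe() and handle the error code. A hedged sketch of such a caller, with invented names:

#include <linux/errno.h>
#include <linux/string.h>

static int copy_from_nvdimm(void *dst, const void *src, size_t n)
{
	/*
	 * memcpy_mcsafe() returns 0 or -EFAULT; on hardware without
	 * machine check recovery it falls back to a plain memcpy()
	 * that always succeeds.
	 */
	if (memcpy_mcsafe(dst, src, n))
		return -EIO;	/* a machine check fired during the copy */

	return 0;
}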
 
index 8f321a1b03a1aaa0e87c4c1182d2b2f282efa1e4..5cb436acd46315b75ba40aacc780373ea2cc4f7c 100644 (file)
 #define _ASM_X86_SWITCH_TO_H
 
 struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+                                   struct task_struct *next);
+
 __visible struct task_struct *__switch_to(struct task_struct *prev,
-                                          struct task_struct *next);
+                                         struct task_struct *next);
 struct tss_struct;
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss);
 
-#ifdef CONFIG_X86_32
+/* This runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+                                    struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we switch to a stack that has a top-level paging entry
+        * that is not present in the current mm, the resulting #PF will
+        * will be promoted to a double-fault and we'll panic.  Probe
+        * the new stack now so that vmalloc_fault can fix up the page
+        * tables if needed.  This can only happen if we use a stack
+        * in vmap space.
+        *
+        * We assume that the stack is aligned so that it never spans
+        * more than one top-level paging entry.
+        *
+        * To minimize cache pollution, just follow the stack pointer.
+        */
+       READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
+asmlinkage void ret_from_fork(void);
+
+/* data that is pointed to by thread.sp */
+struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+       unsigned long r15;
+       unsigned long r14;
+       unsigned long r13;
+       unsigned long r12;
+#else
+       unsigned long si;
+       unsigned long di;
+#endif
+       unsigned long bx;
+       unsigned long bp;
+       unsigned long ret_addr;
+};
 
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary                                                        \
-       "movl %P[task_canary](%[next]), %%ebx\n\t"                      \
-       "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam                                         \
-       , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam                                         \
-       , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else  /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
+struct fork_frame {
+       struct inactive_task_frame frame;
+       struct pt_regs regs;
+};
 
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
 #define switch_to(prev, next, last)                                    \
 do {                                                                   \
-       /*                                                              \
-        * Context-switching clobbers all registers, so we clobber      \
-        * them explicitly, via unused output variables.                \
-        * (EAX and EBP is not listed because EBP is saved/restored     \
-        * explicitly for wchan access and EAX is the return value of   \
-        * __switch_to())                                               \
-        */                                                             \
-       unsigned long ebx, ecx, edx, esi, edi;                          \
-                                                                       \
-       asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
-                    "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
-                    "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
-                    "movl $1f,%[prev_ip]\n\t"  /* save    EIP   */     \
-                    "pushl %[next_ip]\n\t"     /* restore EIP   */     \
-                    __switch_canary                                    \
-                    "jmp __switch_to\n"        /* regparm call  */     \
-                    "1:\t"                                             \
-                    "popl %%ebp\n\t"           /* restore EBP   */     \
-                                                                       \
-                    /* output parameters */                            \
-                    : [prev_sp] "=m" (prev->thread.sp),                \
-                      [prev_ip] "=m" (prev->thread.ip),                \
-                      "=a" (last),                                     \
-                                                                       \
-                      /* clobbered output registers: */                \
-                      "=b" (ebx), "=c" (ecx), "=d" (edx),              \
-                      "=S" (esi), "=D" (edi)                           \
-                                                                       \
-                      __switch_canary_oparam                           \
-                                                                       \
-                      /* input parameters: */                          \
-                    : [next_sp]  "m" (next->thread.sp),                \
-                      [next_ip]  "m" (next->thread.ip),                \
-                                                                       \
-                      /* regparm parameters for __switch_to(): */      \
-                      [prev]     "a" (prev),                           \
-                      [next]     "d" (next)                            \
+       prepare_switch_to(prev, next);                                  \
                                                                        \
-                      __switch_canary_iparam                           \
-                                                                       \
-                    : /* reloaded segment registers */                 \
-                       "memory");                                      \
+       ((last) = __switch_to_asm((prev), (next)));                     \
 } while (0)
 
-#else /* CONFIG_X86_32 */
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
-
-#define __EXTRA_CLOBBER  \
-       , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
-         "r12", "r13", "r14", "r15", "flags"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary                                                          \
-       "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
-       "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam                                           \
-       , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam                                           \
-       , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else  /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
- * has no effect.
- */
-#define switch_to(prev, next, last) \
-       asm volatile(SAVE_CONTEXT                                         \
-            "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
-            "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
-            "call __switch_to\n\t"                                       \
-            "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
-            __switch_canary                                              \
-            "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
-            "movq %%rax,%%rdi\n\t"                                       \
-            "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"                 \
-            "jnz   ret_from_fork\n\t"                                    \
-            RESTORE_CONTEXT                                              \
-            : "=a" (last)                                                \
-              __switch_canary_oparam                                     \
-            : [next] "S" (next), [prev] "D" (prev),                      \
-              [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
-              [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
-              [_tif_fork] "i" (_TIF_FORK),                               \
-              [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-              [current_task] "m" (current_task)                          \
-              __switch_canary_iparam                                     \
-            : "memory", "cc" __EXTRA_CLOBBER)
-
-#endif /* CONFIG_X86_32 */
-
 #endif /* _ASM_X86_SWITCH_TO_H */
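
Since thread.sp now points at a struct inactive_task_frame, code that used to peek at raw stack slots can name the field it wants. A small illustrative helper in the same spirit as the kgdb, get_wchan() and thread_saved_pc() changes later in this merge; the helper itself is not part of the patch:

#include <linux/compiler.h>
#include <linux/sched.h>
#include <asm/switch_to.h>

static unsigned long saved_frame_pointer(struct task_struct *tsk)
{
	/* Only meaningful for a task that is not currently running. */
	struct inactive_task_frame *frame =
		(struct inactive_task_frame *)READ_ONCE(tsk->thread.sp);

	return READ_ONCE_NOCHECK(frame->bp);
}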
index 8b7c8d8e0852cf50d5a03e24f89ae9c693a1891c..494c4b5ada347552922878cd9f102de2f54d4a3b 100644 (file)
@@ -95,7 +95,6 @@ struct thread_info {
 #define TIF_UPROBE             12      /* breakpointed or singlestepping */
 #define TIF_NOTSC              16      /* TSC is not accessible in userland */
 #define TIF_IA32               17      /* IA32 compatibility process */
-#define TIF_FORK               18      /* ret_from_fork */
 #define TIF_NOHZ               19      /* in adaptive nohz mode */
 #define TIF_MEMDIE             20      /* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG     21      /* idle is polling for TIF_NEED_RESCHED */
@@ -119,7 +118,6 @@ struct thread_info {
 #define _TIF_UPROBE            (1 << TIF_UPROBE)
 #define _TIF_NOTSC             (1 << TIF_NOTSC)
 #define _TIF_IA32              (1 << TIF_IA32)
-#define _TIF_FORK              (1 << TIF_FORK)
 #define _TIF_NOHZ              (1 << TIF_NOHZ)
 #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP         (1 << TIF_IO_BITMAP)
index c3496619740aa01711e8e4fa53b62d2f4074561c..01fd0a7f48cd2f97dac2c1fd20e809e17975ad34 100644 (file)
@@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(const char *message,
+                                     struct pt_regs *regs,
+                                     unsigned long fault_address);
+#endif
+
 /* Interrupts/Exceptions */
 enum {
        X86_TRAP_DE = 0,        /*  0, Divide-by-zero */
index 90d84c3eee53a323a775ebd8ab4055f777b81c1d..1ad5fe213043c7b25669e0e7a5aa723ad8d33984 100644 (file)
@@ -282,6 +282,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
        if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
                return -EINVAL;
 
+       acpi_table_print_madt_entry(header);
+
        acpi_lapic_addr = lapic_addr_ovr->address;
 
        return 0;
@@ -998,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
        if (!boot_cpu_has(X86_FEATURE_APIC))
                return -ENODEV;
 
-       /*
-        * Note that the LAPIC address is obtained from the MADT (32-bit value)
-        * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
-        */
-
-       count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
-                                     acpi_parse_lapic_addr_ovr, 0);
-       if (count < 0) {
-               printk(KERN_ERR PREFIX
-                      "Error parsing LAPIC address override entry\n");
-               return count;
-       }
-
-       register_lapic_address(acpi_lapic_addr);
-
        count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
                                      acpi_parse_sapic, MAX_LOCAL_APIC);
 
@@ -1031,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
                        return ret;
                }
 
-               x2count = madt_proc[0].count;
-               count = madt_proc[1].count;
+               count = madt_proc[0].count;
+               x2count = madt_proc[1].count;
        }
        if (!count && !x2count) {
                printk(KERN_ERR PREFIX "No LAPIC entries present\n");
index adb3eaf8fe2a5e038c2444bd8f75b55716faf607..48587335ede8e2296b80ff991d1bf4e8e155ad46 100644 (file)
@@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void)
        saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
-       stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+       initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
        early_gdt_descr.address =
                        (unsigned long)get_cpu_gdt_table(smp_processor_id());
        initial_gs = per_cpu_offset(smp_processor_id());
index 50c95af0f017d7ab73a5cad5dc226b1bf5d9cf60..d165b3540d65e29dd9914f08d4d32c9ed3966883 100644 (file)
@@ -1374,7 +1374,6 @@ void setup_local_APIC(void)
         * Actually disabling the focus CPU check just makes the hang less
         * frequent as it makes the interrupt distributon model be more
         * like LRU than MRU (the short-term load is more even across CPUs).
-        * See also the comment in end_level_ioapic_irq().  --macro
         */
 
        /*
@@ -1828,7 +1827,7 @@ void __init register_lapic_address(unsigned long address)
        if (!x2apic_mode) {
                set_fixmap_nocache(FIX_APIC_BASE, address);
                apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
-                           APIC_BASE, mp_lapic_addr);
+                           APIC_BASE, address);
        }
        if (boot_cpu_physical_apicid == -1U) {
                boot_cpu_physical_apicid  = read_apic_id();
@@ -2093,7 +2092,6 @@ int generic_processor_info(int apicid, int version)
                return -EINVAL;
        }
 
-       num_processors++;
        if (apicid == boot_cpu_physical_apicid) {
                /*
                 * x86_bios_cpu_apicid is required to have processors listed
@@ -2116,10 +2114,13 @@ int generic_processor_info(int apicid, int version)
 
                pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
                           thiscpu, apicid);
+
                disabled_cpus++;
                return -ENOSPC;
        }
 
+       num_processors++;
+
        /*
         * Validate version
         */
index 5b2ae106bd4ac7c3ae519a6e5ee6c191431ada74..8862da76ef6fef6366daac680891e8fca3a8b9fa 100644 (file)
@@ -25,7 +25,7 @@
 static struct apic apic_physflat;
 static struct apic apic_flat;
 
-struct apic __read_mostly *apic = &apic_flat;
+struct apic *apic __ro_after_init = &apic_flat;
 EXPORT_SYMBOL_GPL(apic);
 
 static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -154,7 +154,7 @@ static int flat_probe(void)
        return 1;
 }
 
-static struct apic apic_flat = {
+static struct apic apic_flat __ro_after_init = {
        .name                           = "flat",
        .probe                          = flat_probe,
        .acpi_madt_oem_check            = flat_acpi_madt_oem_check,
@@ -248,7 +248,7 @@ static int physflat_probe(void)
        return 0;
 }
 
-static struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
 
        .name                           = "physical flat",
        .probe                          = physflat_probe,
index c05688b2deff7248473b91d8d9fe953e7395f67e..b109e4389c928d2efcc2df0b83df3c4b7872fe28 100644 (file)
@@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v)
        WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
 }
 
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
        .name                           = "noop",
        .probe                          = noop_probe,
        .acpi_madt_oem_check            = NULL,
index 06dbaa458bfef4bfd052c5b4e7d72b59ce0410d8..56012010332c84d1c5d6276aa20734661b020ad0 100644 (file)
@@ -142,7 +142,7 @@ static int probe_bigsmp(void)
        return dmi_bigsmp;
 }
 
-static struct apic apic_bigsmp = {
+static struct apic apic_bigsmp __ro_after_init = {
 
        .name                           = "bigsmp",
        .probe                          = probe_bigsmp,
index ade25320df96479193fe850e8747f9acad758b2d..015bbf30e3e325f5a9211a62f40f529f11801fe9 100644 (file)
@@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
        hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
 }
 
-static struct irq_chip hpet_msi_controller = {
+static struct irq_chip hpet_msi_controller __ro_after_init = {
        .name = "HPET-MSI",
        .irq_unmask = hpet_msi_unmask,
        .irq_mask = hpet_msi_mask,
index 7c43e716c158de0799760a5537ec05a5bd272a9f..e5fb2f08646097c8ed8d880b97cde89ecb28b83f 100644 (file)
@@ -72,7 +72,7 @@ static int probe_default(void)
        return 1;
 }
 
-static struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
 
        .name                           = "default",
        .probe                          = probe_default,
@@ -126,7 +126,7 @@ static struct apic apic_default = {
 
 apic_driver(apic_default);
 
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
 
 static int cmdline_apic __initdata;
index 54f35d988025b6d867f4915d5d4aeecf9461f352..200af5ae96626e1610c7c946bb0a33c5360f9bcb 100644 (file)
@@ -227,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
                cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
 }
 
-static struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
 
        .name                           = "cluster x2apic",
        .probe                          = x2apic_cluster_probe,
index 4f13f54f1b1f47ba75fb95cae226e1fe4978f241..ff111f05a3145242e7bd89cb1404e00d5825abff 100644 (file)
@@ -98,7 +98,7 @@ static int x2apic_phys_probe(void)
        return apic == &apic_x2apic_phys;
 }
 
-static struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
 
        .name                           = "physical x2apic",
        .probe                          = x2apic_phys_probe,
index cb0673c1e940a2fe0e92f04304972f83aad3bacb..b9f6157d42717010821b80b7ee53947208525a7f 100644 (file)
@@ -560,7 +560,7 @@ static int uv_probe(void)
        return apic == &apic_x2apic_uv_x;
 }
 
-static struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
 
        .name                           = "UV large system",
        .probe                          = uv_probe,
index 2bd5c6ff7ee7c3634af74640f993c0104c405b48..db3a0af9b9ec7d7eef938ba78050c5272a8dca5e 100644 (file)
 #endif
 
 void common(void) {
+       BLANK();
+       OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+       OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
        BLANK();
        OFFSET(TI_flags, thread_info, flags);
        OFFSET(TI_status, thread_info, status);
index ecdc1d217dc0f50a7b760c25cec8bee586643336..880aa093268df7a0d7db23abde984647880c47cc 100644 (file)
@@ -57,6 +57,11 @@ void foo(void)
        /* Size of SYSENTER_stack */
        DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+       BLANK();
+       OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
        BLANK();
        OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
index d875f97d4e0ba0477c648e1244ff238bfd48be03..210927ee2e74ad8db99ecc691dc67fec5c4dc38e 100644 (file)
@@ -56,6 +56,11 @@ int main(void)
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
        BLANK();
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+       DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+       BLANK();
+#endif
+
        DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
        DEFINE(NR_syscalls, sizeof(syscalls_64));
 
index 809eda03c527f29779fe94b0ab4922a8a34798bb..06919427d4518b2eaa9b777063512b838b420a25 100644 (file)
@@ -1265,9 +1265,14 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
-                                   (unsigned long) debug_idt_table };
+struct desc_ptr idt_descr __ro_after_init = {
+       .size = NR_VECTORS * 16 - 1,
+       .address = (unsigned long) idt_table,
+};
+const struct desc_ptr debug_idt_descr = {
+       .size = NR_VECTORS * 16 - 1,
+       .address = (unsigned long) debug_idt_table,
+};
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1281,7 +1286,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
 EXPORT_PER_CPU_SYMBOL(current_task);
 
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
-       init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+       init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
 
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
@@ -1305,11 +1310,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
-       /*
-        * LSTAR and STAR live in a bit strange symbiosis.
-        * They both write to the same internal register. STAR allows to
-        * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
-        */
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
        wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
index 27e46658ebe3a940512e383e1ec4b73814aca9a4..35691a6b0d325eb71f8c9695d2eecda957d838f3 100644 (file)
@@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void)
               x86_hyper->x2apic_available &&
               x86_hyper->x2apic_available();
 }
+
+void hypervisor_pin_vcpu(int cpu)
+{
+       if (!x86_hyper)
+               return;
+
+       if (x86_hyper->pin_vcpu)
+               x86_hyper->pin_vcpu(cpu);
+       else
+               WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+}
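
For the WARN_ONCE() above to stay silent, a hypervisor backend has to supply the new .pin_vcpu callback in its struct hypervisor_x86. A hedged sketch of that hookup; the backend name and the callback body are invented:

#include <asm/hypervisor.h>

static void example_pin_vcpu(int cpu)
{
	/* ask the hypervisor to pin the vCPU backing this logical CPU */
}

static const struct hypervisor_x86 x86_hyper_example = {
	.name		= "example",
	.pin_vcpu	= example_pin_vcpu,
};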
index 79d8ec849468e70ab42d7f5bfc19497578208a79..7f3f0e147242adf673f89fdbd5c7e6f86697a880 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/irq_work.h>
 #include <linux/export.h>
+#include <linux/jump_label.h>
 
 #include <asm/processor.h>
 #include <asm/traps.h>
@@ -1633,17 +1634,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 
                if (c->x86 == 6 && c->x86_model == 45)
                        quirk_no_way_out = quirk_sandybridge_ifu;
-               /*
-                * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
-                * whether this processor will actually generate recoverable
-                * machine checks. Check to see if this is an E7 model Xeon.
-                * We can't do a model number check because E5 and E7 use the
-                * same model number. E5 doesn't support recovery, E7 does.
-                */
-               if (mca_cfg.recovery || (mca_cfg.ser &&
-                       !strncmp(c->x86_model_id,
-                                "Intel(R) Xeon(R) CPU E7-", 24)))
-                       set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
        }
        if (cfg->monarch_timeout < 0)
                cfg->monarch_timeout = 0;
@@ -2080,6 +2070,7 @@ void mce_disable_bank(int bank)
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
  */
 static int __init mcheck_enable(char *str)
 {
@@ -2676,8 +2667,14 @@ static int __init mcheck_debugfs_init(void)
 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
 #endif
 
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
 static int __init mcheck_late_init(void)
 {
+       if (mca_cfg.recovery)
+               static_branch_inc(&mcsafe_key);
+
        mcheck_debugfs_init();
 
        /*
index b816971f5da4cd45d1461831283aafb39578f296..620ab06bcf4571c8841b70e35a57657dda2e9985 100644 (file)
@@ -54,6 +54,7 @@ static LIST_HEAD(pcache);
  */
 static u8 *container;
 static size_t container_size;
+static bool ucode_builtin;
 
 static u32 ucode_new_rev;
 static u8 amd_ucode_patch[PATCH_MAX_SIZE];
@@ -281,18 +282,22 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
 void __init load_ucode_amd_bsp(unsigned int family)
 {
        struct cpio_data cp;
+       bool *builtin;
        void **data;
        size_t *size;
 
 #ifdef CONFIG_X86_32
        data =  (void **)__pa_nodebug(&ucode_cpio.data);
        size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+       builtin = (bool *)__pa_nodebug(&ucode_builtin);
 #else
        data = &ucode_cpio.data;
        size = &ucode_cpio.size;
+       builtin = &ucode_builtin;
 #endif
 
-       if (!load_builtin_amd_microcode(&cp, family))
+       *builtin = load_builtin_amd_microcode(&cp, family);
+       if (!*builtin)
                cp = find_ucode_in_initrd();
 
        if (!(cp.data && cp.size))
@@ -373,7 +378,8 @@ void load_ucode_amd_ap(void)
                return;
 
        /* Add CONFIG_RANDOMIZE_MEMORY offset. */
-       cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+       if (!ucode_builtin)
+               cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
 
        eax = cpuid_eax(0x00000001);
        eq  = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
@@ -439,7 +445,8 @@ int __init save_microcode_in_initrd_amd(void)
                container = cont_va;
 
        /* Add CONFIG_RANDOMIZE_MEMORY offset. */
-       container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+       if (!ucode_builtin)
+               container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
 
        eax   = cpuid_eax(0x00000001);
        eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
index 28f1b54b7fadce321bec098792e6dc4977429445..24e87e74990df40d6ed8d18dbb51439e7dd4286a 100644 (file)
@@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex);
 u64 size_or_mask, size_and_mask;
 static bool mtrr_aps_delayed_init;
 
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
 
 const struct mtrr_ops *mtrr_if;
 
 static void set_mtrr(unsigned int reg, unsigned long base,
                     unsigned long size, mtrr_type type);
 
-void set_mtrr_ops(const struct mtrr_ops *ops)
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
 {
        if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
                mtrr_ops[ops->vendor] = ops;
index 6c7ced07d16d1181c6ef21f4f2252ef63019a77b..ad8bd763efa52bd0426f2d756783c00f257b1f2a 100644 (file)
@@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index,
 bool get_mtrr_state(void);
 void mtrr_bp_pat_init(void);
 
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
 
 extern u64 size_or_mask, size_and_mask;
 extern const struct mtrr_ops *mtrr_if;
index 92e8f0a7159cadbafba6763081c2b33293be420a..c6c6c39c367f5de2f40194c55098762476e38b25 100644 (file)
@@ -26,10 +26,11 @@ int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
 
 static void printk_stack_address(unsigned long address, int reliable,
-               void *data)
+                                char *log_lvl)
 {
+       touch_nmi_watchdog();
        printk("%s [<%p>] %s%pB\n",
-               (char *)data, (void *)address, reliable ? "" : "? ",
+               log_lvl, (void *)address, reliable ? "" : "? ",
                (void *)address);
 }
 
@@ -38,38 +39,6 @@ void printk_address(unsigned long address)
        pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
 }
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
-                       const struct stacktrace_ops *ops,
-                       struct task_struct *task, int *graph)
-{
-       unsigned long ret_addr;
-       int index;
-
-       if (addr != (unsigned long)return_to_handler)
-               return;
-
-       index = task->curr_ret_stack;
-
-       if (!task->ret_stack || index < *graph)
-               return;
-
-       index -= *graph;
-       ret_addr = task->ret_stack[index].ret;
-
-       ops->address(data, ret_addr, 1);
-
-       (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
-                       const struct stacktrace_ops *ops,
-                       struct task_struct *task, int *graph)
-{ }
-#endif
-
 /*
  * x86-64 can have up to three kernel stacks:
  * process stack
@@ -107,18 +76,33 @@ print_context_stack(struct task_struct *task,
                stack = (unsigned long *)task_stack_page(task);
 
        while (valid_stack_ptr(task, stack, sizeof(*stack), end)) {
-               unsigned long addr;
+               unsigned long addr = *stack;
 
-               addr = *stack;
                if (__kernel_text_address(addr)) {
+                       unsigned long real_addr;
+                       int reliable = 0;
+
                        if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
+                               reliable = 1;
                                frame = frame->next_frame;
                                bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, 0);
                        }
-                       print_ftrace_graph_addr(addr, data, ops, task, graph);
+
+                       /*
+                        * When function graph tracing is enabled for a
+                        * function, its return address on the stack is
+                        * replaced with the address of an ftrace handler
+                        * (return_to_handler).  In that case, before printing
+                        * the "real" address, we want to print the handler
+                        * address as an "unreliable" hint that function graph
+                        * tracing was involved.
+                        */
+                       real_addr = ftrace_graph_ret_addr(task, graph, addr,
+                                                         stack);
+                       if (real_addr != addr)
+                               ops->address(data, addr, 0);
+
+                       ops->address(data, real_addr, reliable);
                }
                stack++;
        }
@@ -133,19 +117,21 @@ print_context_stack_bp(struct task_struct *task,
                       unsigned long *end, int *graph)
 {
        struct stack_frame *frame = (struct stack_frame *)bp;
-       unsigned long *ret_addr = &frame->return_address;
+       unsigned long *retp = &frame->return_address;
 
-       while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) {
-               unsigned long addr = *ret_addr;
+       while (valid_stack_ptr(task, retp, sizeof(*retp), end)) {
+               unsigned long addr = *retp;
+               unsigned long real_addr;
 
                if (!__kernel_text_address(addr))
                        break;
 
-               if (ops->address(data, addr, 1))
+               real_addr = ftrace_graph_ret_addr(task, graph, addr, retp);
+               if (ops->address(data, real_addr, 1))
                        break;
+
                frame = frame->next_frame;
-               ret_addr = &frame->return_address;
-               print_ftrace_graph_addr(addr, data, ops, task, graph);
+               retp = &frame->return_address;
        }
 
        return (unsigned long)frame;
@@ -163,7 +149,6 @@ static int print_trace_stack(void *data, char *name)
  */
 static int print_trace_address(void *data, unsigned long addr, int reliable)
 {
-       touch_nmi_watchdog();
        printk_stack_address(addr, reliable, data);
        return 0;
 }
@@ -182,24 +167,17 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
        dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 }
 
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
        unsigned long bp = 0;
-       unsigned long stack;
 
        /*
         * Stack frames below this one aren't interesting.  Don't show them
         * if we're printing for %current.
         */
        if (!sp && (!task || task == current)) {
-               sp = &stack;
-               bp = stack_frame(current, NULL);
+               sp = get_stack_pointer(current, NULL);
+               bp = (unsigned long)get_frame_pointer(current, NULL);
        }
 
        show_stack_log_lvl(task, NULL, sp, bp, "");
@@ -207,7 +185,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
 
 void show_stack_regs(struct pt_regs *regs)
 {
-       show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");
+       show_stack_log_lvl(current, regs, NULL, 0, "");
 }
 
 static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
index 09675712eba807a8470962c53102ad1e7dc32afa..c533b8b5a373e0924ced05c4f15803699012e4a9 100644 (file)
@@ -46,19 +46,9 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        int graph = 0;
        u32 *prev_esp;
 
-       if (!task)
-               task = current;
-
-       if (!stack) {
-               unsigned long dummy;
-
-               stack = &dummy;
-               if (task != current)
-                       stack = (unsigned long *)task->thread.sp;
-       }
-
-       if (!bp)
-               bp = stack_frame(task, regs);
+       task = task ? : current;
+       stack = stack ? : get_stack_pointer(task, regs);
+       bp = bp ? : (unsigned long)get_frame_pointer(task, regs);
 
        for (;;) {
                void *end_stack;
@@ -95,14 +85,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        unsigned long *stack;
        int i;
 
-       if (sp == NULL) {
-               if (regs)
-                       sp = (unsigned long *)regs->sp;
-               else if (task)
-                       sp = (unsigned long *)task->thread.sp;
-               else
-                       sp = (unsigned long *)&sp;
-       }
+       sp = sp ? : get_stack_pointer(task, regs);
 
        stack = sp;
        for (i = 0; i < kstack_depth_to_print; i++) {
@@ -139,7 +122,7 @@ void show_regs(struct pt_regs *regs)
                u8 *ip;
 
                pr_emerg("Stack:\n");
-               show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+               show_stack_log_lvl(NULL, regs, NULL, 0, KERN_EMERG);
 
                pr_emerg("Code:");
 
index 9ee4520ce83c8fe0dd127a50e7022010f86414ca..b243352c779e663b695d460e0aad2d0ae313ac1b 100644 (file)
@@ -103,9 +103,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
        return (stack >= irq_stack && stack < irq_stack_end);
 }
 
-static const unsigned long irq_stack_size =
-       (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
-
 enum stack_type {
        STACK_IS_UNKNOWN,
        STACK_IS_NORMAL,
@@ -133,7 +130,7 @@ analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
                return STACK_IS_NORMAL;
 
        *stack_end = irq_stack;
-       irq_stack = irq_stack - irq_stack_size;
+       irq_stack -= (IRQ_STACK_SIZE / sizeof(long));
 
        if (in_irq_stack(stack, irq_stack, *stack_end))
                return STACK_IS_IRQ;
@@ -154,25 +151,14 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 {
        const unsigned cpu = get_cpu();
        unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
-       unsigned long dummy;
        unsigned used = 0;
        int graph = 0;
        int done = 0;
 
-       if (!task)
-               task = current;
-
-       if (!stack) {
-               if (regs)
-                       stack = (unsigned long *)regs->sp;
-               else if (task != current)
-                       stack = (unsigned long *)task->thread.sp;
-               else
-                       stack = &dummy;
-       }
+       task = task ? : current;
+       stack = stack ? : get_stack_pointer(task, regs);
+       bp = bp ? : (unsigned long)get_frame_pointer(task, regs);
 
-       if (!bp)
-               bp = stack_frame(task, regs);
        /*
         * Print function call entries in all stacks, starting at the
         * current stack address. If the stacks consist of nested
@@ -202,7 +188,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
                        bp = ops->walk_stack(task, stack, bp, ops,
                                             data, stack_end, &graph);
-                       ops->stack(data, "<EOE>");
+                       ops->stack(data, "EOE");
                        /*
                         * We link to the next stack via the
                         * second-to-last pointer (index -2 to end) in the
@@ -256,21 +242,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        preempt_disable();
        cpu = smp_processor_id();
 
-       irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
-       irq_stack       = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+       irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+       irq_stack     = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
 
-       /*
-        * Debugging aid: "show_stack(NULL, NULL);" prints the
-        * back trace for this cpu:
-        */
-       if (sp == NULL) {
-               if (regs)
-                       sp = (unsigned long *)regs->sp;
-               else if (task)
-                       sp = (unsigned long *)task->thread.sp;
-               else
-                       sp = (unsigned long *)&sp;
-       }
+       sp = sp ? : get_stack_pointer(task, regs);
 
        stack = sp;
        for (i = 0; i < kstack_depth_to_print; i++) {
@@ -308,9 +283,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 void show_regs(struct pt_regs *regs)
 {
        int i;
-       unsigned long sp;
 
-       sp = regs->sp;
        show_regs_print_info(KERN_DEFAULT);
        __show_regs(regs, 1);
 
@@ -325,8 +298,7 @@ void show_regs(struct pt_regs *regs)
                u8 *ip;
 
                printk(KERN_DEFAULT "Stack:\n");
-               show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-                                  0, KERN_DEFAULT);
+               show_stack_log_lvl(NULL, regs, NULL, 0, KERN_DEFAULT);
 
                printk(KERN_DEFAULT "Code: ");
 
index 621b501f89351146b84b090b7160166bb9d5907e..871f1863457dccdf2fdcdf310aae774ab1b3c9d2 100644 (file)
@@ -388,11 +388,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
        while (nr_map) {
                u64 start = biosmap->addr;
                u64 size = biosmap->size;
-               u64 end = start + size;
+               u64 end = start + size - 1;
                u32 type = biosmap->type;
 
                /* Overflow in 64 bits? Ignore the memory map. */
-               if (start > end)
+               if (start > end && likely(size))
                        return -1;
 
                e820_add_region(start, size, type);
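
The switch to an inclusive end (start + size - 1) is easiest to see with concrete numbers. The standalone demo below (values invented) shows that an entry ending exactly at 2^64 is now accepted while one that wraps past zero is still rejected; the added likely(size) keeps a zero-sized entry from tripping the overflow check.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start  = 0xfffffffffffff000ULL;	/* invented BIOS entry */
	uint64_t ok_sz  = 0x1000;	/* ends exactly at 2^64: keep it     */
	uint64_t bad_sz = 0x2000;	/* wraps around zero: ignore the map */

	printf("ok entry rejected?  %d\n", start > start + ok_sz - 1);	/* 0 */
	printf("bad entry rejected? %d\n", start > start + bad_sz - 1);	/* 1 */
	return 0;
}

With the old exclusive end (start + size), the first entry would have wrapped to 0 and been thrown away as a false positive.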
index 3fc03a09a93b1710b966a91ba8ae65750abc2f95..47004010ad5dd42ec03e5ca075d832ef0c925e38 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/traps.h>
 
 #include <linux/hardirq.h>
+#include <linux/pkeys.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/fpu.h>
@@ -505,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
                copy_kernel_to_fxregs(&init_fpstate.fxsave);
        else
                copy_kernel_to_fregs(&init_fpstate.fsave);
+
+       if (boot_cpu_has(X86_FEATURE_OSPKE))
+               copy_init_pkru_to_fpregs();
 }
 
 /*
index 01567aa87503f021cfb3b4104520842b1ec87179..124aa5c593f8da7aba6643bc609a332744d10ba6 100644 (file)
@@ -5,6 +5,7 @@
  */
 #include <linux/compat.h>
 #include <linux/cpu.h>
+#include <linux/mman.h>
 #include <linux/pkeys.h>
 
 #include <asm/fpu/api.h>
@@ -866,9 +867,10 @@ const void *get_xsave_field_ptr(int xsave_state)
        return get_xsave_addr(&fpu->state.xsave, xsave_state);
 }
 
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
-
 /*
  * This will go out and modify PKRU register to set the access
  * rights for @pkey to @init_val.
@@ -914,6 +916,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 
        return 0;
 }
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
  * This is similar to user_regset_copyout(), but will not add offset to
index d036cfb4495db30015db71eee64cf7f633603a2b..8639bb2ae05868ab65d88e44683f44c8651121f3 100644 (file)
@@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
        }
 
        if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-                   frame_pointer) == -EBUSY) {
+                                    frame_pointer, parent) == -EBUSY) {
                *parent = old;
                return;
        }
index 4707baf9420347eaf808484de967282081f4ec8c..b6b2f0264af36ac272537e5d6689a38361bb60d4 100644 (file)
@@ -95,7 +95,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
  */
 __HEAD
 ENTRY(startup_32)
-       movl pa(stack_start),%ecx
+       movl pa(initial_stack),%ecx
        
        /* test KEEP_SEGMENTS flag to see if the bootloader is asking
                us to not reload segments */
@@ -287,7 +287,7 @@ num_subarch_entries = (. - subarch_entries) / 4
  * start_secondary().
  */
 ENTRY(start_cpu0)
-       movl stack_start, %ecx
+       movl initial_stack, %ecx
        movl %ecx, %esp
        jmp  *(initial_code)
 ENDPROC(start_cpu0)
@@ -308,7 +308,7 @@ ENTRY(startup_32_smp)
        movl %eax,%es
        movl %eax,%fs
        movl %eax,%gs
-       movl pa(stack_start),%ecx
+       movl pa(initial_stack),%ecx
        movl %eax,%ss
        leal -__PAGE_OFFSET(%ecx),%esp
 
@@ -705,7 +705,7 @@ ENTRY(initial_page_table)
 
 .data
 .balign 4
-ENTRY(stack_start)
+ENTRY(initial_stack)
        .long init_thread_union+THREAD_SIZE
 
 __INITRODATA
index 537d913f45ecbdf9c397920ce91bc493a888a9ca..b4421cc191b056727f8f8c0def78a750b319a1c4 100644 (file)
@@ -67,7 +67,7 @@ startup_64:
         */
 
        /*
-        * Setup stack for verify_cpu(). "-8" because stack_start is defined
+        * Setup stack for verify_cpu(). "-8" because initial_stack is defined
         * this way, see below. Our best guess is a NULL ptr for stack
         * termination heuristics and we don't want to break anything which
         * might depend on it (kgdb, ...).
@@ -227,7 +227,7 @@ ENTRY(secondary_startup_64)
        movq    %rax, %cr0
 
        /* Setup a boot time stack */
-       movq stack_start(%rip), %rsp
+       movq initial_stack(%rip), %rsp
 
        /* zero EFLAGS after setting rsp */
        pushq $0
@@ -311,7 +311,7 @@ ENDPROC(secondary_startup_64)
  * start_secondary().
  */
 ENTRY(start_cpu0)
-       movq stack_start(%rip),%rsp
+       movq initial_stack(%rip),%rsp
        movq    initial_code(%rip),%rax
        pushq   $0              # fake return address to stop unwinder
        pushq   $__KERNEL_CS    # set correct cs
@@ -320,17 +320,15 @@ ENTRY(start_cpu0)
 ENDPROC(start_cpu0)
 #endif
 
-       /* SMP bootup changes these two */
+       /* Both SMP bootup and ACPI suspend change these variables */
        __REFDATA
        .balign 8
        GLOBAL(initial_code)
        .quad   x86_64_start_kernel
        GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
-
-       GLOBAL(stack_start)
+       GLOBAL(initial_stack)
        .quad  init_thread_union+THREAD_SIZE-8
-       .word  0
        __FINITDATA
 
 bad_address:
index c6dfd801df973039e8c00f1d48ad4058713c914d..274fab99169d8235628b13fa955fc98b520ee27f 100644 (file)
@@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
 /*
  * Clock source related code
  */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of an NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+       struct {
+               arch_spinlock_t lock;
+               u32 value;
+       };
+       u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+       { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+       unsigned long flags;
+       union hpet_lock old, new;
+
+       BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+       /*
+        * Read HPET directly if in NMI.
+        */
+       if (in_nmi())
+               return (cycle_t)hpet_readl(HPET_COUNTER);
+
+       /*
+        * Read the current state of the lock and HPET value atomically.
+        */
+       old.lockval = READ_ONCE(hpet.lockval);
+
+       if (arch_spin_is_locked(&old.lock))
+               goto contended;
+
+       local_irq_save(flags);
+       if (arch_spin_trylock(&hpet.lock)) {
+               new.value = hpet_readl(HPET_COUNTER);
+               /*
+                * Use WRITE_ONCE() to prevent store tearing.
+                */
+               WRITE_ONCE(hpet.value, new.value);
+               arch_spin_unlock(&hpet.lock);
+               local_irq_restore(flags);
+               return (cycle_t)new.value;
+       }
+       local_irq_restore(flags);
+
+contended:
+       /*
+        * Contended case
+        * --------------
+        * Wait until the HPET value changes or the lock is free to indicate
+        * its value is up-to-date.
+        *
+        * It is possible that old.value has already contained the latest
+        * HPET value while the lock holder was in the process of releasing
+        * the lock. Checking for lock state change will enable us to return
+        * the value immediately instead of waiting for the next HPET reader
+        * to come along.
+        */
+       do {
+               cpu_relax();
+               new.lockval = READ_ONCE(hpet.lockval);
+       } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+       return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
 static cycle_t read_hpet(struct clocksource *cs)
 {
        return (cycle_t)hpet_readl(HPET_COUNTER);
 }
+#endif
 
 static struct clocksource clocksource_hpet = {
        .name           = "hpet",
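
The BUILD_BUG_ON() above encodes the whole trick: the 32-bit lock word and the 32-bit cached counter share one 64-bit word, so a reader can snapshot both with a single load. A tiny standalone illustration of that layout, assuming little-endian x86 and using plain integers as stand-ins for the kernel types:

#include <assert.h>
#include <stdint.h>

union hpet_lock_demo {
	struct {
		uint32_t lock;	/* stand-in for arch_spinlock_t */
		uint32_t value;	/* last HPET counter value read */
	};
	uint64_t lockval;	/* both fields, readable in one 64-bit load */
};

int main(void)
{
	union hpet_lock_demo d = { .lockval = 0 };

	d.lock  = 1;		/* "locked" */
	d.value = 0x12345678;	/* cached counter */

	assert(sizeof(d) == 8);
	assert(d.lockval == ((uint64_t)0x12345678 << 32 | 1));
	return 0;
}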
index 04cde527d72849be75ccb65e0d7ed650a1ef3a82..8e36f249646e25d20bc2bcc04b7a0ccc292e498c 100644 (file)
@@ -50,6 +50,7 @@
 #include <asm/apicdef.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 {
@@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
        gdb_regs[GDB_DX]        = 0;
        gdb_regs[GDB_SI]        = 0;
        gdb_regs[GDB_DI]        = 0;
-       gdb_regs[GDB_BP]        = *(unsigned long *)p->thread.sp;
+       gdb_regs[GDB_BP]        = ((struct inactive_task_frame *)p->thread.sp)->bp;
 #ifdef CONFIG_X86_32
        gdb_regs[GDB_DS]        = __KERNEL_DS;
        gdb_regs[GDB_ES]        = __KERNEL_DS;
        gdb_regs[GDB_PS]        = 0;
        gdb_regs[GDB_CS]        = __KERNEL_CS;
-       gdb_regs[GDB_PC]        = p->thread.ip;
        gdb_regs[GDB_SS]        = __KERNEL_DS;
        gdb_regs[GDB_FS]        = 0xFFFF;
        gdb_regs[GDB_GS]        = 0xFFFF;
 #else
-       gdb_regs32[GDB_PS]      = *(unsigned long *)(p->thread.sp + 8);
+       gdb_regs32[GDB_PS]      = 0;
        gdb_regs32[GDB_CS]      = __KERNEL_CS;
        gdb_regs32[GDB_SS]      = __KERNEL_DS;
-       gdb_regs[GDB_PC]        = 0;
        gdb_regs[GDB_R8]        = 0;
        gdb_regs[GDB_R9]        = 0;
        gdb_regs[GDB_R10]       = 0;
@@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
        gdb_regs[GDB_R14]       = 0;
        gdb_regs[GDB_R15]       = 0;
 #endif
+       gdb_regs[GDB_PC]        = 0;
        gdb_regs[GDB_SP]        = p->thread.sp;
 }
 
index c2bedaea11f7b929709e64979e17e15ca4d8d60a..4afc67f5facc497606e2fb898ba7b7d402f04caa 100644 (file)
@@ -184,7 +184,7 @@ out:
 
 static struct kobj_attribute type_attr = __ATTR_RO(type);
 
-static struct bin_attribute data_attr = {
+static struct bin_attribute data_attr __ro_after_init = {
        .attr = {
                .name = "data",
                .mode = S_IRUGO,
index 1726c4c12336d1ba184789683d92a442f51d24df..1f431f362dd545ef052751bf094c15f8fbc8246e 100644 (file)
@@ -423,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
        kvm_spinlock_init();
 }
 
-static void kvm_guest_cpu_online(void *dummy)
-{
-       kvm_guest_cpu_init();
-}
-
-static void kvm_guest_cpu_offline(void *dummy)
+static void kvm_guest_cpu_offline(void)
 {
        kvm_disable_steal_time();
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -437,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy)
        apf_task_wake_all();
 }
 
-static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
-                         void *hcpu)
+static int kvm_cpu_online(unsigned int cpu)
 {
-       int cpu = (unsigned long)hcpu;
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-       case CPU_ONLINE_FROZEN:
-               smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
-               break;
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
-               break;
-       default:
-               break;
-       }
-       return NOTIFY_OK;
+       local_irq_disable();
+       kvm_guest_cpu_init();
+       local_irq_enable();
+       return 0;
 }
 
-static struct notifier_block kvm_cpu_notifier = {
-        .notifier_call  = kvm_cpu_notify,
-};
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+       local_irq_disable();
+       kvm_guest_cpu_offline();
+       local_irq_enable();
+       return 0;
+}
 #endif
 
 static void __init kvm_apf_trap_init(void)
@@ -494,7 +481,9 @@ void __init kvm_guest_init(void)
 
 #ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-       register_cpu_notifier(&kvm_cpu_notifier);
+       if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+                                     kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+               pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
 #else
        kvm_guest_cpu_init();
 #endif
index 1d39bfbd26bb3d0918271f13af86c94385983150..0964399ef942125580e84fe124b5817ed52d0e9f 100644 (file)
@@ -29,7 +29,7 @@
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 
-static int kvmclock = 1;
+static int kvmclock __ro_after_init = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 static cycle_t kvm_sched_clock_offset;
index 068c4a929de6335eed9e4737f069f30029fe5b73..0f8d204973838071d5ce936a97b7f5a0b524055e 100644 (file)
@@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early)
 {
        struct mpf_intel *mpf = mpf_found;
 
+       if (!smp_found_config)
+               return;
+
        if (!mpf)
                return;
 
index 1acfd76e3e26b73d5e874d3e48fb2e0b9efa9ca8..bef340082d207f51acad6e4e3501aaffe15b95cb 100644 (file)
@@ -389,7 +389,7 @@ NOKPROBE_SYMBOL(native_load_idt);
 #define PTE_IDENT      __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
 #endif
 
-struct pv_mmu_ops pv_mmu_ops = {
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
index 62c0b0ea2ce4483cb2ab29dc10492f877b2f93e7..c1fa790c81cd51a0cb5f5b1650a749c3b86300ee 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/vm86.h>
+#include <asm/switch_to.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -512,6 +513,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
 
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+       struct inactive_task_frame *frame =
+               (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
+       return READ_ONCE_NOCHECK(frame->ret_addr);
+}
+
 /*
  * Called from fs/proc with a reference on @p to find the function
  * which called into schedule(). This needs to be done carefully
@@ -556,7 +568,7 @@ unsigned long get_wchan(struct task_struct *p)
        if (sp < bottom || sp > top)
                return 0;
 
-       fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+       fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
        do {
                if (fp < bottom || fp > top)
                        return 0;
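
The new thread_saved_pc() and the get_wchan() change above dereference the switch frame through struct inactive_task_frame. The sketch below is only an approximation of the layout assumed here; the authoritative definition lives in arch/x86/include/asm/switch_to.h in this tree, and the field order is inferred from the frame->bp, frame->ret_addr and frame->bx accesses in these hunks.

/* Approximate sketch, not the authoritative header. */
struct inactive_task_frame {
#ifdef CONFIG_X86_64
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
#else
	unsigned long si;
	unsigned long di;
#endif
	unsigned long bx;
	/*
	 * bp and ret_addr sit at the top so the saved frame looks like a
	 * normal call frame to frame-pointer based unwinders.
	 */
	unsigned long bp;
	unsigned long ret_addr;
};

struct fork_frame {
	struct inactive_task_frame frame;
	struct pt_regs regs;
};
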
index d86be29c38c73c8bb8e128aa3fdeae53ad39b1c1..404efdfa083b45c9bf719181ab434d15121ed4c0 100644 (file)
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
-       return ((unsigned long *)tsk->thread.sp)[3];
-}
-
 void __show_regs(struct pt_regs *regs, int all)
 {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        unsigned long arg, struct task_struct *p, unsigned long tls)
 {
        struct pt_regs *childregs = task_pt_regs(p);
+       struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+       struct inactive_task_frame *frame = &fork_frame->frame;
        struct task_struct *tsk;
        int err;
 
-       p->thread.sp = (unsigned long) childregs;
+       frame->bp = 0;
+       frame->ret_addr = (unsigned long) ret_from_fork;
+       p->thread.sp = (unsigned long) fork_frame;
        p->thread.sp0 = (unsigned long) (childregs+1);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               p->thread.ip = (unsigned long) ret_from_kernel_thread;
-               task_user_gs(p) = __KERNEL_STACK_CANARY;
-               childregs->ds = __USER_DS;
-               childregs->es = __USER_DS;
-               childregs->fs = __KERNEL_PERCPU;
-               childregs->bx = sp;     /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->di = arg;
                p->thread.io_bitmap_ptr = NULL;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;
 
-       p->thread.ip = (unsigned long) ret_from_fork;
        task_user_gs(p) = get_user_gs(current_pt_regs());
 
        p->thread.io_bitmap_ptr = NULL;
index 63236d8f84bf50c3291a9aa4591d3de24a500412..b812cd0d7889c6491f0ae8c6c75f633b51142324 100644 (file)
@@ -50,8 +50,6 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 
-asmlinkage extern void ret_from_fork(void);
-
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
@@ -141,12 +139,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 {
        int err;
        struct pt_regs *childregs;
+       struct fork_frame *fork_frame;
+       struct inactive_task_frame *frame;
        struct task_struct *me = current;
 
        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
-       p->thread.sp = (unsigned long) childregs;
-       set_tsk_thread_flag(p, TIF_FORK);
+       fork_frame = container_of(childregs, struct fork_frame, regs);
+       frame = &fork_frame->frame;
+       frame->bp = 0;
+       frame->ret_addr = (unsigned long) ret_from_fork;
+       p->thread.sp = (unsigned long) fork_frame;
        p->thread.io_bitmap_ptr = NULL;
 
        savesegment(gs, p->thread.gsindex);
@@ -160,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               childregs->sp = (unsigned long)childregs;
-               childregs->ss = __KERNEL_DS;
-               childregs->bx = sp; /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->r12 = arg;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
 
        childregs->ax = 0;
index f79576a541ffa21df086684eb2e5502cc5062248..5b88a1b26fc747cf0cdb533384a014c0ac9f3e93 100644 (file)
@@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
                return sp;
 
        prev_esp = (u32 *)(context);
-       if (prev_esp)
-               return (unsigned long)prev_esp;
+       if (*prev_esp)
+               return (unsigned long)*prev_esp;
 
        return (unsigned long)regs;
 }
@@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 #ifdef CONFIG_X86_64
 
-static struct user_regset x86_64_regsets[] __read_mostly = {
+static struct user_regset x86_64_regsets[] __ro_after_init = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = {
 #endif /* CONFIG_X86_64 */
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
+static struct user_regset x86_32_regsets[] __ro_after_init = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = {
  */
 u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
 {
 #ifdef CONFIG_X86_64
        x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
index cc457ff818ad65c0ea92bb29257d0629141716fa..51402a7e4ca6ed040bd727477a0cee9c23426887 100644 (file)
@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
                        amd_disable_seq_and_redirect_scrub);
 
 #endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+       u32 capid0;
+
+       pci_read_config_dword(pdev, 0x84, &capid0);
+
+       if (capid0 & 0x10)
+               static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+       u32 capid0;
+
+       pci_read_config_dword(pdev, 0x84, &capid0);
+
+       if ((capid0 & 0xc0) == 0xc0)
+               static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
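
The quirks above only increment mcsafe_key; the expected consumer gates the unrolled, exception-handling copy behind the static key and otherwise falls back to a plain memcpy(). The sketch below is an assumed shape of memcpy_mcsafe() in <asm/string_64.h> after the memcpy_mcsafe_unrolled rename further down, not a verbatim quote of it:

#include <linux/jump_label.h>
#include <linux/string.h>

DECLARE_STATIC_KEY_FALSE(mcsafe_key);
__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);

static __always_inline __must_check int
memcpy_mcsafe(void *dst, const void *src, size_t cnt)
{
#ifdef CONFIG_X86_MCE
	/* Taken only if one of the RAS-capability quirks flipped the key. */
	if (static_branch_unlikely(&mcsafe_key))
		return memcpy_mcsafe_unrolled(dst, src, cnt);
#endif
	memcpy(dst, src, cnt);
	return 0;
}
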
index 63bf27d972b77bfcb59997b473474ddbe41af5d0..e244c19a2451aa7d4dfa8bf34b1834e48f68ec54 100644 (file)
@@ -705,7 +705,7 @@ static void native_machine_power_off(void)
        tboot_shutdown(TB_SHUTDOWN_HALT);
 }
 
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
        .power_off = native_machine_power_off,
        .shutdown = native_machine_shutdown,
        .emergency_restart = native_machine_emergency_restart,
index 0fa60f5f5a1641dc8eb2c82968dbadd650ed6e37..1489da81bb4d98a7391557e013b83b41a1ca6712 100644 (file)
@@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data);
 
 
 #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-__visible unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
 #else
-__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
 #endif
 
 /* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -1221,8 +1221,7 @@ void __init setup_arch(char **cmdline_p)
        /*
         * get boot-time SMP configuration:
         */
-       if (smp_found_config)
-               get_smp_config();
+       get_smp_config();
 
        prefill_possible_map();
 
index 7a40e068302d2a8a465e1621efc2024c2343e326..2bbd27f8980217bc443ac4d202d6658328061a81 100644 (file)
@@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
 EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
        [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
 };
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void)
 #ifdef CONFIG_X86_64
                per_cpu(irq_stack_ptr, cpu) =
                        per_cpu(irq_stack_union.irq_stack, cpu) +
-                       IRQ_STACK_SIZE - 64;
+                       IRQ_STACK_SIZE;
 #endif
 #ifdef CONFIG_NUMA
                per_cpu(x86_cpu_to_node_map, cpu) =
index 4296beb8fdd3b2d91a9644c9843d66dd2132c2ad..c53093a93eab6f6e52deae74b7db1b01df4b1459 100644 (file)
@@ -942,7 +942,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
        per_cpu(cpu_current_top_of_stack, cpu) =
                (unsigned long)task_stack_page(idle) + THREAD_SIZE;
 #else
-       clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
 #endif
 }
@@ -969,7 +968,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
-       stack_start  = idle->thread.sp;
+       initial_stack  = idle->thread.sp;
 
        /*
         * Enable the espfix hack for this CPU
@@ -1115,17 +1114,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 
        common_cpu_up(cpu, tidle);
 
-       /*
-        * We have to walk the irq descriptors to setup the vector
-        * space for the cpu which comes online.  Prevent irq
-        * alloc/free across the bringup.
-        */
-       irq_lock_sparse();
-
        err = do_boot_cpu(apicid, cpu, tidle);
-
        if (err) {
-               irq_unlock_sparse();
                pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
                return -EIO;
        }
@@ -1143,8 +1133,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
                touch_nmi_watchdog();
        }
 
-       irq_unlock_sparse();
-
        return 0;
 }
 
@@ -1323,14 +1311,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
                break;
        }
 
-       default_setup_apic_routing();
-
        if (read_apic_id() != boot_cpu_physical_apicid) {
                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
                     read_apic_id(), boot_cpu_physical_apicid);
                /* Or can we switch back to PIC here? */
        }
 
+       default_setup_apic_routing();
        cpu0_logical_apicid = apic_bsp_setup(false);
 
        pr_info("CPU%d: ", 0);
index b70ca12dd389c57fd32222fa1279f0d46c39a6ac..bd4e3d4d3625ceeb4e4e356371feb444cea94fb6 100644 (file)
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present", segment_not_present)
 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
 
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+                                               struct pt_regs *regs,
+                                               unsigned long fault_address)
+{
+       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+                (void *)fault_address, current->stack,
+                (char *)current->stack + THREAD_SIZE - 1);
+       die(message, regs, 0);
+
+       /* Be absolutely certain we don't return. */
+       panic(message);
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
        static const char str[] = "double fault";
        struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+       unsigned long cr2;
+#endif
 
 #ifdef CONFIG_X86_ESPFIX64
        extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = X86_TRAP_DF;
 
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we overflow the stack into a guard page, the CPU will fail
+        * to deliver #PF and will send #DF instead.  Similarly, if we
+        * take any non-IST exception while too close to the bottom of
+        * the stack, the processor will get a page fault while
+        * delivering the exception and will generate a double fault.
+        *
+        * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+        * Page-Fault Exception (#PF)"):
+        *
+        *   Processors update CR2 whenever a page fault is detected. If a
+        *   second page fault occurs while an earlier page fault is being
+        *   delivered, the faulting linear address of the second fault will
+        *   overwrite the contents of CR2 (replacing the previous
+        *   address). These updates to CR2 occur even if the page fault
+        *   results in a double fault or occurs during the delivery of a
+        *   double fault.
+        *
+        * The logic below has a small possibility of incorrectly diagnosing
+        * some errors as stack overflows.  For example, if the IDT or GDT
+        * gets corrupted such that #GP delivery fails due to a bad descriptor
+        * causing #GP and we hit this condition while CR2 coincidentally
+        * points to the stack guard page, we'll think we overflowed the
+        * stack.  Given that we're going to panic one way or another
+        * if this happens, this isn't necessarily worth fixing.
+        *
+        * If necessary, we could improve the test by only diagnosing
+        * a stack overflow if the saved RSP points within 47 bytes of
+        * the bottom of the stack: if RSP == tsk_stack + 48 and we
+        * take an exception, the stack is already aligned and there
+        * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+        * possible error code, so a stack overflow would *not* double
+        * fault.  With any less space left, exception delivery could
+        * fail, and, as a practical matter, we've overflowed the
+        * stack even if the actual trigger for the double fault was
+        * something else.
+        */
+       cr2 = read_cr2();
+       if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+               handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
 #ifdef CONFIG_DOUBLEFAULT
        df_debug(regs, error_code);
 #endif
index 76c5e52436c454b42602ce91758d4ec6acb8ef03..0bd9f1287f3962380f9f0f03470ce25f9ce8932b 100644 (file)
@@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
 static void default_nmi_init(void) { };
 static int default_i8042_detect(void) { return 1; };
 
-struct x86_platform_ops x86_platform = {
+struct x86_platform_ops x86_platform __ro_after_init = {
        .calibrate_cpu                  = native_calibrate_cpu,
        .calibrate_tsc                  = native_calibrate_tsc,
        .get_wallclock                  = mach_get_cmos_time,
@@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = {
 EXPORT_SYMBOL_GPL(x86_platform);
 
 #if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi = {
+struct x86_msi_ops x86_msi __ro_after_init = {
        .setup_msi_irqs         = native_setup_msi_irqs,
        .teardown_msi_irq       = native_teardown_msi_irq,
        .teardown_msi_irqs      = default_teardown_msi_irqs,
@@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 }
 #endif
 
-struct x86_io_apic_ops x86_io_apic_ops = {
+struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = {
        .read                   = native_io_apic_read,
        .disable                = native_disable_io_apic,
 };
index af523d84d102fde15d9194010ef7335495680670..1e6b84b96ea697d7a047f9c6583b42807209e952 100644 (file)
@@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
        avic_handle_ldr_update(vcpu);
 }
 
-static struct kvm_x86_ops svm_x86_ops = {
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
        .hardware_setup = svm_hardware_setup,
index 5cede40e255241a1d8db2cc52f24505a5ced65ca..121fdf6e9ed09d15ef3c5abb8d00cf77a2ab5e78 100644 (file)
@@ -11177,7 +11177,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
                        ~FEATURE_CONTROL_LMCE;
 }
 
-static struct kvm_x86_ops vmx_x86_ops = {
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .hardware_setup = hardware_setup,
index 94c917af968801c643f396d64f051598f24e4e15..779782f5832476582becc24e5a0f0f5b10ea0b53 100644 (file)
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe - memory copy with machine check exception handling
+ * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe)
+ENTRY(memcpy_mcsafe_unrolled)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words
@@ -276,8 +276,8 @@ ENTRY(memcpy_mcsafe)
 .L_done_memcpy_trap:
        xorq %rax, %rax
        ret
-ENDPROC(memcpy_mcsafe)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe)
+ENDPROC(memcpy_mcsafe_unrolled)
+EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
 
        .section .fixup, "ax"
        /* Return -EFAULT for any failure */
index ba47524f56e81682ef7e112d446060880726d080..d1c7de0958086f6ca3901fccf179fbd9873aac44 100644 (file)
@@ -52,21 +52,6 @@ static __init int find_northbridge(void)
        return -ENOENT;
 }
 
-static __init void early_get_boot_cpu_id(void)
-{
-       /*
-        * need to get the APIC ID of the BSP so can use that to
-        * create apicid_to_node in amd_scan_nodes()
-        */
-#ifdef CONFIG_X86_MPPARSE
-       /*
-        * get boot-time SMP configuration:
-        */
-       if (smp_found_config)
-               early_get_smp_config();
-#endif
-}
-
 int __init amd_numa_init(void)
 {
        u64 start = PFN_PHYS(0);
@@ -180,8 +165,11 @@ int __init amd_numa_init(void)
        cores = 1 << bits;
        apicid_base = 0;
 
-       /* get the APIC ID of the BSP early for systems with apicid lifting */
-       early_get_boot_cpu_id();
+       /*
+        * get boot-time SMP configuration:
+        */
+       early_get_smp_config();
+
        if (boot_cpu_physical_apicid > 0) {
                pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
                apicid_base = boot_cpu_physical_apicid;
index dc802306045653be7177fd619cdc3dffbcd92815..f0df3f06134c6291144e43e25e244cfaebac676f 100644 (file)
@@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,
                return;
        }
 
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * Stack overflow?  During boot, we can fault near the initial
+        * stack in the direct map, but that's not an overflow -- check
+        * that we're in vmalloc space to avoid this.
+        */
+       if (is_vmalloc_addr((void *)address) &&
+           (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+            address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+               register void *__sp asm("rsp");
+               unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+               /*
+                * We're likely to be running with very little stack space
+                * left.  It's plausible that we'd hit this condition but
+                * double-fault even before we get this far, in which case
+                * we're fine: the double-fault handler will deal with it.
+                *
+                * We don't want to make it all the way into the oops code
+                * and then double-fault, though, because we're likely to
+                * break the console driver and lose most of the stack dump.
+                */
+               asm volatile ("movq %[stack], %%rsp\n\t"
+                             "call handle_stack_overflow\n\t"
+                             "1: jmp 1b"
+                             : "+r" (__sp)
+                             : "D" ("kernel stack overflow (page fault)"),
+                               "S" (regs), "d" (address),
+                               [stack] "rm" (stack));
+               unreachable();
+       }
+#endif
+
        /*
         * 32-bit:
         *
@@ -1112,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 {
        /* This is only called for the current mm, so: */
        bool foreign = false;
+
+       /*
+        * Read or write was blocked by protection keys.  This is
+        * always an unconditional error and can never result in
+        * a follow-up action to resolve the fault, like a COW.
+        */
+       if (error_code & PF_PK)
+               return 1;
+
        /*
         * Make sure to check the VMA so that we do not perform
         * faults just to hit a PF_PK as soon as we fill in a
index bda8d5eef04d5594630114541d4b1ce043ff890a..ddd2661c4502922a63fbcd169615d34dddd6fcae 100644 (file)
  * You need to add an if/def entry if you introduce a new memory region
  * compatible with KASLR. Your entry must be in logical order with memory
  * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
  * ensure that this order is correct and won't be changed.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-static const unsigned long vaddr_end = VMEMMAP_START;
+
+#if defined(CONFIG_X86_ESPFIX64)
+static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
+#elif defined(CONFIG_EFI)
+static const unsigned long vaddr_end = EFI_VA_START;
+#else
+static const unsigned long vaddr_end = __START_KERNEL_map;
+#endif
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
 EXPORT_SYMBOL(page_offset_base);
 unsigned long vmalloc_base = __VMALLOC_BASE;
 EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
 
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
@@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {
 } kaslr_regions[] = {
        { &page_offset_base, 64/* Maximum */ },
        { &vmalloc_base, VMALLOC_SIZE_TB },
+       { &vmemmap_base, 1 },
 };
 
 /* Get size in bytes used by the memory region */
@@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)
        struct rnd_state rand_state;
        unsigned long remain_entropy;
 
+       /*
+        * All these BUILD_BUG_ON checks ensure that the memory layout is
+        * consistent with the vaddr_start/vaddr_end variables.
+        */
+       BUILD_BUG_ON(vaddr_start >= vaddr_end);
+       BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
+                    vaddr_end >= EFI_VA_START);
+       BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
+                     config_enabled(CONFIG_EFI)) &&
+                    vaddr_end >= __START_KERNEL_map);
+       BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
        if (!kaslr_memory_enabled())
                return;
 
index e8c474451928a709b46a80b55caf371f405f87b2..f88ce0e5efd99bcb1dff5d0de7e4329eca86c84e 100644 (file)
@@ -11,6 +11,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  */
+#include <linux/debugfs.h>             /* debugfs_create_u32()         */
 #include <linux/mm_types.h>             /* mm_struct, vma, etc...       */
 #include <linux/pkeys.h>                /* PKEY_*                       */
 #include <uapi/asm-generic/mman-common.h>
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
+       bool need_to_set_mm_pkey = false;
+       int execute_only_pkey = mm->context.execute_only_pkey;
        int ret;
 
+       /* Do we need to assign a pkey for mm's execute-only maps? */
+       if (execute_only_pkey == -1) {
+               /* Go allocate one to use, which might fail */
+               execute_only_pkey = mm_pkey_alloc(mm);
+               if (execute_only_pkey < 0)
+                       return -1;
+               need_to_set_mm_pkey = true;
+       }
+
        /*
         * We do not want to go through the relatively costly
         * dance to set PKRU if we do not need to.  Check it
@@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)
         * can make fpregs inactive.
         */
        preempt_disable();
-       if (fpregs_active() &&
-           !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+       if (!need_to_set_mm_pkey &&
+           fpregs_active() &&
+           !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
                preempt_enable();
-               return PKEY_DEDICATED_EXECUTE_ONLY;
+               return execute_only_pkey;
        }
        preempt_enable();
-       ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+
+       /*
+        * Set up PKRU so that it denies access for everything
+        * other than execution.
+        */
+       ret = arch_set_user_pkey_access(current, execute_only_pkey,
                        PKEY_DISABLE_ACCESS);
        /*
         * If the PKRU-set operation failed somehow, just return
         * 0 and effectively disable execute-only support.
         */
-       if (ret)
-               return 0;
+       if (ret) {
+               mm_set_pkey_free(mm, execute_only_pkey);
+               return -1;
+       }
 
-       return PKEY_DEDICATED_EXECUTE_ONLY;
+       /* We got one, store it and use it from here on out */
+       if (need_to_set_mm_pkey)
+               mm->context.execute_only_pkey = execute_only_pkey;
+       return execute_only_pkey;
 }
 
 static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
@@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
        /* Do this check first since the vm_flags should be hot */
        if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
                return false;
-       if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+       if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
                return false;
 
        return true;
@@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
         */
        return vma_pkey(vma);
 }
+
+#define PKRU_AD_KEY(pkey)      (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+
+/*
+ * Make the default PKRU value (at execve() time) as restrictive
+ * as possible.  This ensures that any threads clone()'d early
+ * in the process's lifetime will not accidentally get access
+ * to data which is pkey-protected later on.
+ */
+u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
+                     PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
+                     PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
+                     PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
+                     PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+
+/*
+ * Called from the FPU code when creating a fresh set of FPU
+ * registers.  This is called from a very specific context where
+ * we know the FPU registers are safe for use and we can use PKRU
+ * directly.  The fact that PKRU is only available when we are
+ * using eagerfpu mode makes this possible.
+ */
+void copy_init_pkru_to_fpregs(void)
+{
+       u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
+       /*
+        * Any write to PKRU takes it out of the XSAVE 'init
+        * state' which increases context switch cost.  Avoid
+        * writing 0 when PKRU was already 0.
+        */
+       if (!init_pkru_value_snapshot && !read_pkru())
+               return;
+       /*
+        * Override the PKRU state that came from 'init_fpstate'
+        * with the baseline from the process.
+        */
+       write_pkru(init_pkru_value_snapshot);
+}
+
+static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
+                            size_t count, loff_t *ppos)
+{
+       char buf[32];
+       unsigned int len;
+
+       len = sprintf(buf, "0x%x\n", init_pkru_value);
+       return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t init_pkru_write_file(struct file *file,
+                const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       char buf[32];
+       ssize_t len;
+       u32 new_init_pkru;
+
+       len = min(count, sizeof(buf) - 1);
+       if (copy_from_user(buf, user_buf, len))
+               return -EFAULT;
+
+       /* Make the buffer a valid string that we can not overrun */
+       buf[len] = '\0';
+       if (kstrtouint(buf, 0, &new_init_pkru))
+               return -EINVAL;
+
+       /*
+        * Don't allow insane settings that will blow the system
+        * up immediately if someone attempts to disable access
+        * or writes to pkey 0.
+        */
+       if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT))
+               return -EINVAL;
+
+       WRITE_ONCE(init_pkru_value, new_init_pkru);
+       return count;
+}
+
+static const struct file_operations fops_init_pkru = {
+       .read = init_pkru_read_file,
+       .write = init_pkru_write_file,
+       .llseek = default_llseek,
+};
+
+static int __init create_init_pkru_value(void)
+{
+       debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
+                       arch_debugfs_dir, NULL, &fops_init_pkru);
+       return 0;
+}
+late_initcall(create_init_pkru_value);
+
+static __init int setup_init_pkru(char *opt)
+{
+       u32 new_init_pkru;
+
+       if (kstrtouint(opt, 0, &new_init_pkru))
+               return 1;
+
+       WRITE_ONCE(init_pkru_value, new_init_pkru);
+
+       return 1;
+}
+__setup("init_pkru=", setup_init_pkru);
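
For reference, assuming PKRU_BITS_PER_PKEY == 2 and PKRU_AD_BIT == 0x1 (values taken from the x86 pgtable headers, not from this hunk), PKRU_AD_KEY(pkey) expands to 1 << (2 * pkey), so the default init_pkru_value above works out as:

/* OR of the access-disable bit for pkeys 1..15:
 *   bits 2, 4, 6, ..., 30 set  ->  0x55555554
 * i.e. every pkey except pkey 0 starts out access-disabled. */
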
index 4dbe65622810208182bfe363efdd477a84abb040..a7655f6caf7dbd641dfea7c3e60581e01df594cb 100644 (file)
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
        unsigned cpu = smp_processor_id();
 
        if (likely(prev != next)) {
+               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                       /*
+                        * If our current stack is in vmalloc space and isn't
+                        * mapped in the new pgd, we'll double-fault.  Forcibly
+                        * map it.
+                        */
+                       unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+                       pgd_t *pgd = next->pgd + stack_pgd_index;
+
+                       if (unlikely(pgd_none(*pgd)))
+                               set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+               }
+
 #ifdef CONFIG_SMP
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
                cpumask_set_cpu(cpu, mm_cpumask(next));
 
                /*
index cb31a4440e588552f5fd046ae787c107aa09ec94..d950f9ea9a8c646029e24c45df66bf0cf1c96c44 100644 (file)
@@ -113,10 +113,14 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
        struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
        if (!user_mode(regs)) {
-               unsigned long stack = kernel_stack_pointer(regs);
-               if (depth)
-                       dump_trace(NULL, regs, (unsigned long *)stack, 0,
-                                  &backtrace_ops, &depth);
+               if (!depth)
+                       return;
+
+               oprofile_add_trace(regs->ip);
+               if (!--depth)
+                       return;
+
+               dump_trace(NULL, regs, NULL, 0, &backtrace_ops, &depth);
                return;
        }
 
index 9770e55e768fde446cff8482e7c903a5ba6a61e0..1d97cea3b3a44efb3152e32278573970c2c0c2c3 100644 (file)
@@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service)
 static struct {
        unsigned long address;
        unsigned short segment;
-} pci_indirect = { 0, __KERNEL_CS };
+} pci_indirect __ro_after_init = {
+       .address = 0,
+       .segment = __KERNEL_CS,
+};
 
-static int pci_bios_present;
+static int pci_bios_present __ro_after_init;
 
 static int __init check_pcibios(void)
 {
index 8ff7b9355416bdae9e8f587d989807a2582d97fc..d49d3be8195324a42de6c85657af187e6a5a2d41 100644 (file)
@@ -155,7 +155,7 @@ static void punit_dbgfs_unregister(void)
 
 static const struct x86_cpu_id intel_punit_cpu_ids[] = {
        ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
-       ICPU(INTEL_FAM6_ATOM_MERRIFIELD1, punit_device_tng),
+       ICPU(INTEL_FAM6_ATOM_MERRIFIELD,  punit_device_tng),
        ICPU(INTEL_FAM6_ATOM_AIRMONT,     punit_device_cht),
        {}
 };
index fc135bf70511a6a4cd2a877ed696f907e2bad805..429d08be7848a2df6334aef7e5e2c4e79e4b6d4a 100644 (file)
@@ -1,5 +1,9 @@
 # Family-Level Interface Shim (FLIS)
 obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
+# SDHCI Devices
+obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
+# WiFi
+obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
 # IPC Devices
 obj-y += platform_ipc.o
 obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
new file mode 100644 (file)
index 0000000..4392c15
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * platform_bcm43xx.c: bcm43xx platform data initialization file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/fixed.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+
+#define WLAN_SFI_GPIO_IRQ_NAME         "WLAN-interrupt"
+#define WLAN_SFI_GPIO_ENABLE_NAME      "WLAN-enable"
+
+#define WLAN_DEV_NAME                  "0000:00:01.3"
+
+static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
+       .dev_name               = WLAN_DEV_NAME,
+       .supply                 = "vmmc",
+};
+
+static struct regulator_init_data bcm43xx_vmmc_data = {
+       .constraints = {
+               .valid_ops_mask         = REGULATOR_CHANGE_STATUS,
+       },
+       .num_consumer_supplies  = 1,
+       .consumer_supplies      = &bcm43xx_vmmc_supply,
+};
+
+static struct fixed_voltage_config bcm43xx_vmmc = {
+       .supply_name            = "bcm43xx-vmmc-regulator",
+       /*
+        * Announce 2.0V here to be compatible with SDIO specification. The
+        * real voltage and signaling are still 1.8V.
+        */
+       .microvolts             = 2000000,              /* 1.8V */
+       .gpio                   = -EINVAL,
+       .startup_delay          = 250 * 1000,           /* 250ms */
+       .enable_high            = 1,                    /* active high */
+       .enabled_at_boot        = 0,                    /* disabled at boot */
+       .init_data              = &bcm43xx_vmmc_data,
+};
+
+static struct platform_device bcm43xx_vmmc_regulator = {
+       .name           = "reg-fixed-voltage",
+       .id             = PLATFORM_DEVID_AUTO,
+       .dev = {
+               .platform_data  = &bcm43xx_vmmc,
+       },
+};
+
+static int __init bcm43xx_regulator_register(void)
+{
+       int ret;
+
+       bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+       ret = platform_device_register(&bcm43xx_vmmc_regulator);
+       if (ret) {
+               pr_err("%s: vmmc regulator register failed\n", __func__);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void __init *bcm43xx_platform_data(void *info)
+{
+       int ret;
+
+       ret = bcm43xx_regulator_register();
+       if (ret)
+               return NULL;
+
+       pr_info("Using generic wifi platform data\n");
+
+       /* For now it's empty */
+       return NULL;
+}
+
+static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
+       .name                   = "bcm43xx_clk_vmmc",
+       .type                   = SFI_DEV_TYPE_SD,
+       .get_platform_data      = &bcm43xx_platform_data,
+};
+
+sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
new file mode 100644 (file)
index 0000000..00c4a03
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * SDHCI platform data initialisation file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/pci.h>
+
+#include <linux/mmc/sdhci-pci-data.h>
+
+#include <asm/intel-mid.h>
+
+#define INTEL_MRFLD_SD                 2
+#define INTEL_MRFLD_SD_CD_GPIO         77
+
+static struct sdhci_pci_data mrfld_sdhci_pci_data = {
+       .rst_n_gpio     = -EINVAL,
+       .cd_gpio        = INTEL_MRFLD_SD_CD_GPIO,
+};
+
+static struct sdhci_pci_data *
+mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
+{
+       unsigned int func = PCI_FUNC(pdev->devfn);
+
+       if (func == INTEL_MRFLD_SD)
+               return &mrfld_sdhci_pci_data;
+
+       return NULL;
+}
+
+static int __init mrfld_sd_init(void)
+{
+       if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+               return -ENODEV;
+
+       sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
+       return 0;
+}
+arch_initcall(mrfld_sd_init);
index ce119d2ba0d00edbe9b065b8773314a5d9814874..7850128f0026f2a045fa135173fe95963a927f14 100644 (file)
@@ -70,6 +70,11 @@ EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
 
 static void intel_mid_power_off(void)
 {
+       /* Shut down South Complex via PWRMU */
+       intel_mid_pwr_power_off();
+
+       /* Only for Tangier, the rest will ignore this command */
+       intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1);
 };
 
 static void intel_mid_reboot(void)
index c901a342377233390ee0cd9b255502e1dfdfe67a..5d3b45ad1c034d4ca922ccdd10ad7d2edb565384 100644 (file)
 /* Bits in PM_CMD */
 #define PM_CMD_CMD(x)          ((x) << 0)
 #define PM_CMD_IOC             (1 << 8)
-#define PM_CMD_D3cold          (1 << 21)
+#define PM_CMD_CM_NOP          (0 << 9)
+#define PM_CMD_CM_IMMEDIATE    (1 << 9)
+#define PM_CMD_CM_DELAY                (2 << 9)
+#define PM_CMD_CM_TRIGGER      (3 << 9)
+
+/* System states */
+#define PM_CMD_SYS_STATE_S5    (5 << 16)
+
+/* Trigger variants */
+#define PM_CMD_CFG_TRIGGER_NC  (3 << 19)
+
+/* Message to wait for TRIGGER_NC case */
+#define TRIGGER_NC_MSG_2       (2 << 22)
 
 /* List of commands */
 #define CMD_SET_CFG            0x01
@@ -137,7 +149,7 @@ static int mid_pwr_wait(struct mid_pwr *pwr)
 
 static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd)
 {
-       writel(PM_CMD_CMD(cmd), pwr->regs + PM_CMD);
+       writel(PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE, pwr->regs + PM_CMD);
        return mid_pwr_wait(pwr);
 }
 
@@ -260,6 +272,20 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state)
 }
 EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state);
 
+void intel_mid_pwr_power_off(void)
+{
+       struct mid_pwr *pwr = midpwr;
+       u32 cmd = PM_CMD_SYS_STATE_S5 |
+                 PM_CMD_CMD(CMD_SET_CFG) |
+                 PM_CMD_CM_TRIGGER |
+                 PM_CMD_CFG_TRIGGER_NC |
+                 TRIGGER_NC_MSG_2;
+
+       /* Send command to SCU */
+       writel(cmd, pwr->regs + PM_CMD);
+       mid_pwr_wait(pwr);
+}
+
 int intel_mid_pwr_get_lss_id(struct pci_dev *pdev)
 {
        int vndr;
@@ -354,7 +380,7 @@ static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        return 0;
 }
 
-static int mid_set_initial_state(struct mid_pwr *pwr)
+static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states)
 {
        unsigned int i, j;
        int ret;
@@ -379,10 +405,10 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
         * NOTE: The actual device mapping is provided by a platform at run
         * time using vendor capability of PCI configuration space.
         */
-       mid_pwr_set_state(pwr, 0, 0xffffffff);
-       mid_pwr_set_state(pwr, 1, 0xffffffff);
-       mid_pwr_set_state(pwr, 2, 0xffffffff);
-       mid_pwr_set_state(pwr, 3, 0xffffffff);
+       mid_pwr_set_state(pwr, 0, states[0]);
+       mid_pwr_set_state(pwr, 1, states[1]);
+       mid_pwr_set_state(pwr, 2, states[2]);
+       mid_pwr_set_state(pwr, 3, states[3]);
 
        /* Send command to SCU */
        ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG);
@@ -397,13 +423,41 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
        return 0;
 }
 
-static const struct mid_pwr_device_info mid_info = {
-       .set_initial_state = mid_set_initial_state,
+static int pnw_set_initial_state(struct mid_pwr *pwr)
+{
+       /* On Penwell SRAM must stay powered on */
+       const u32 states[] = {
+               0xf00fffff,             /* PM_SSC(0) */
+               0xffffffff,             /* PM_SSC(1) */
+               0xffffffff,             /* PM_SSC(2) */
+               0xffffffff,             /* PM_SSC(3) */
+       };
+       return mid_set_initial_state(pwr, states);
+}
+
+static int tng_set_initial_state(struct mid_pwr *pwr)
+{
+       const u32 states[] = {
+               0xffffffff,             /* PM_SSC(0) */
+               0xffffffff,             /* PM_SSC(1) */
+               0xffffffff,             /* PM_SSC(2) */
+               0xffffffff,             /* PM_SSC(3) */
+       };
+       return mid_set_initial_state(pwr, states);
+}
+
+static const struct mid_pwr_device_info pnw_info = {
+       .set_initial_state = pnw_set_initial_state,
+};
+
+static const struct mid_pwr_device_info tng_info = {
+       .set_initial_state = tng_set_initial_state,
 };
 
+/* This table should be in sync with the one in drivers/pci/pci-mid.c */
 static const struct pci_device_id mid_pwr_pci_ids[] = {
-       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info },
-       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&pnw_info },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&tng_info },
        {}
 };
 
index b86ebb1a9a7f45cff133b5e99b90e34734252d16..bc9aaba01a223644303808f0e5b75956cf78eef0 100644 (file)
@@ -1925,6 +1925,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c)
        }
 }
 
+static void xen_pin_vcpu(int cpu)
+{
+       static bool disable_pinning;
+       struct sched_pin_override pin_override;
+       int ret;
+
+       if (disable_pinning)
+               return;
+
+       pin_override.pcpu = cpu;
+       ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+
+       /* Ignore errors when removing override. */
+       if (cpu < 0)
+               return;
+
+       switch (ret) {
+       case -ENOSYS:
+               pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
+                       cpu);
+               disable_pinning = true;
+               break;
+       case -EPERM:
+               WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+               disable_pinning = true;
+               break;
+       case -EINVAL:
+       case -EBUSY:
+               pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
+                       cpu);
+               break;
+       case 0:
+               break;
+       default:
+               WARN(1, "rc %d while trying to pin vcpu\n", ret);
+               disable_pinning = true;
+       }
+}
+
 const struct hypervisor_x86 x86_hyper_xen = {
        .name                   = "Xen",
        .detect                 = xen_platform,
@@ -1933,6 +1972,7 @@ const struct hypervisor_x86 x86_hyper_xen = {
 #endif
        .x2apic_available       = xen_x2apic_para_available,
        .set_cpu_features       = xen_set_cpu_features,
+       .pin_vcpu               = xen_pin_vcpu,
 };
 EXPORT_SYMBOL(x86_hyper_xen);
 
index 9e079d49e7f2e678044d0cb527daec7cf70a293e..24365b30aae95adc240c5e38bd07edb7da03ba13 100644 (file)
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
 
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                                PKEY_DISABLE_WRITE)
+
 #endif /* _XTENSA_MMAN_H */
index 3797155bf2169584ab54bc4d6f1c84f9bdfad88e..70ebd769712bd29ae638fb13523c18ccf6e0659d 100644 (file)
@@ -40,10 +40,8 @@ int acpi_sysfs_init(void);
 void acpi_container_init(void);
 void acpi_memory_hotplug_init(void);
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
-int acpi_ioapic_add(struct acpi_pci_root *root);
 int acpi_ioapic_remove(struct acpi_pci_root *root);
 #else
-static inline int acpi_ioapic_add(struct acpi_pci_root *root) { return 0; }
 static inline int acpi_ioapic_remove(struct acpi_pci_root *root) { return 0; }
 #endif
 #ifdef CONFIG_ACPI_DOCK
index ccdc8db16bb896274beffc11f3e966327d6ac6e8..6d7ce6e12aaa6662b360c391f8c3b5e84a84ddc7 100644 (file)
@@ -46,7 +46,7 @@ static acpi_status setup_res(struct acpi_resource *acpi_res, void *data)
        struct resource_win win;
 
        res->flags = 0;
-       if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM) == 0)
+       if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM))
                return AE_OK;
 
        if (!acpi_dev_resource_memory(acpi_res, res)) {
@@ -97,7 +97,7 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl,
        unsigned long long gsi_base;
        struct acpi_pci_ioapic *ioapic;
        struct pci_dev *dev = NULL;
-       struct resource *res = NULL;
+       struct resource *res = NULL, *pci_res = NULL, *crs_res;
        char *type = NULL;
 
        if (!acpi_is_ioapic(handle, &type))
@@ -137,23 +137,30 @@ static acpi_status handle_ioapic_add(acpi_handle handle, u32 lvl,
                pci_set_master(dev);
                if (pci_request_region(dev, 0, type))
                        goto exit_disable;
-               res = &dev->resource[0];
+               pci_res = &dev->resource[0];
                ioapic->pdev = dev;
        } else {
                pci_dev_put(dev);
                dev = NULL;
+       }
 
-               res = &ioapic->res;
-               acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, res);
-               if (res->flags == 0) {
-                       acpi_handle_warn(handle, "failed to get resource\n");
-                       goto exit_free;
-               } else if (request_resource(&iomem_resource, res)) {
-                       acpi_handle_warn(handle, "failed to insert resource\n");
-                       goto exit_free;
-               }
+       crs_res = &ioapic->res;
+       acpi_walk_resources(handle, METHOD_NAME__CRS, setup_res, crs_res);
+       crs_res->name = type;
+       crs_res->flags |= IORESOURCE_BUSY;
+       if (crs_res->flags == 0) {
+               acpi_handle_warn(handle, "failed to get resource\n");
+               goto exit_release;
+       } else if (insert_resource(&iomem_resource, crs_res)) {
+               acpi_handle_warn(handle, "failed to insert resource\n");
+               goto exit_release;
        }
 
+       /* try pci resource first, then "_CRS" resource */
+       res = pci_res;
+       if (!res || !res->flags)
+               res = crs_res;
+
        if (acpi_register_ioapic(handle, res->start, (u32)gsi_base)) {
                acpi_handle_warn(handle, "failed to register IOAPIC\n");
                goto exit_release;
@@ -174,14 +181,13 @@ done:
 exit_release:
        if (dev)
                pci_release_region(dev, 0);
-       else
-               release_resource(res);
+       if (ioapic->res.flags && ioapic->res.parent)
+               release_resource(&ioapic->res);
 exit_disable:
        if (dev)
                pci_disable_device(dev);
 exit_put:
        pci_dev_put(dev);
-exit_free:
        kfree(ioapic);
 exit:
        mutex_unlock(&ioapic_list_lock);
@@ -189,13 +195,13 @@ exit:
        return AE_OK;
 }
 
-int acpi_ioapic_add(struct acpi_pci_root *root)
+int acpi_ioapic_add(acpi_handle root_handle)
 {
        acpi_status status, retval = AE_OK;
 
-       status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root->device->handle,
+       status = acpi_walk_namespace(ACPI_TYPE_DEVICE, root_handle,
                                     UINT_MAX, handle_ioapic_add, NULL,
-                                    root->device->handle, (void **)&retval);
+                                    root_handle, (void **)&retval);
 
        return ACPI_SUCCESS(status) && ACPI_SUCCESS(retval) ? 0 : -ENODEV;
 }
@@ -217,9 +223,9 @@ int acpi_ioapic_remove(struct acpi_pci_root *root)
                        pci_release_region(ioapic->pdev, 0);
                        pci_disable_device(ioapic->pdev);
                        pci_dev_put(ioapic->pdev);
-               } else if (ioapic->res.flags && ioapic->res.parent) {
-                       release_resource(&ioapic->res);
                }
+               if (ioapic->res.flags && ioapic->res.parent)
+                       release_resource(&ioapic->res);
                list_del(&ioapic->list);
                kfree(ioapic);
        }
index d144168d4ef9dbfb66f45383ee62ab9bf00c0438..bf601d4df8cfcbb6e579b00cbff75efcb24d8071 100644 (file)
@@ -614,7 +614,17 @@ static int acpi_pci_root_add(struct acpi_device *device,
        if (hotadd) {
                pcibios_resource_survey_bus(root->bus);
                pci_assign_unassigned_root_bus_resources(root->bus);
-               acpi_ioapic_add(root);
+               /*
+                * This is only called for the hotadd case. For the boot-time
+                * case, we need to wait until after PCI initialization in
+                * order to deal with IOAPICs mapped in on a PCI BAR.
+                *
+                * This is currently x86-specific, because acpi_ioapic_add()
+                * is an empty function without CONFIG_ACPI_HOTPLUG_IOAPIC.
+                * And CONFIG_ACPI_HOTPLUG_IOAPIC depends on CONFIG_X86_IO_APIC
+                * (see drivers/acpi/Kconfig).
+                */
+               acpi_ioapic_add(root->device->handle);
        }
 
        pci_lock_rescan_remove();
index ffa7c9dcbd7a1be41ea07e4bf7e976fb494f65d6..890082315054d0c2043151a4ea25e107fe61b1bb 100644 (file)
@@ -144,15 +144,12 @@ struct cci_pmu {
        int num_cntrs;
        atomic_t active_events;
        struct mutex reserve_mutex;
-       struct list_head entry;
+       struct hlist_node node;
        cpumask_t cpus;
 };
 
 #define to_cci_pmu(c)  (container_of(c, struct cci_pmu, pmu))
 
-static DEFINE_MUTEX(cci_pmu_mutex);
-static LIST_HEAD(cci_pmu_list);
-
 enum cci_models {
 #ifdef CONFIG_ARM_CCI400_PMU
        CCI400_R0,
@@ -1506,25 +1503,21 @@ static int cci_pmu_init(struct cci_pmu *cci_pmu, struct platform_device *pdev)
        return perf_pmu_register(&cci_pmu->pmu, name, -1);
 }
 
-static int cci_pmu_offline_cpu(unsigned int cpu)
+static int cci_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
 {
-       struct cci_pmu *cci_pmu;
+       struct cci_pmu *cci_pmu = hlist_entry_safe(node, struct cci_pmu, node);
        unsigned int target;
 
-       mutex_lock(&cci_pmu_mutex);
-       list_for_each_entry(cci_pmu, &cci_pmu_list, entry) {
-               if (!cpumask_test_and_clear_cpu(cpu, &cci_pmu->cpus))
-                       continue;
-               target = cpumask_any_but(cpu_online_mask, cpu);
-               if (target >= nr_cpu_ids)
-                       continue;
-               /*
-                * TODO: migrate context once core races on event->ctx have
-                * been fixed.
-                */
-               cpumask_set_cpu(target, &cci_pmu->cpus);
-       }
-       mutex_unlock(&cci_pmu_mutex);
+       if (!cpumask_test_and_clear_cpu(cpu, &cci_pmu->cpus))
+               return 0;
+       target = cpumask_any_but(cpu_online_mask, cpu);
+       if (target >= nr_cpu_ids)
+               return 0;
+       /*
+        * TODO: migrate context once core races on event->ctx have
+        * been fixed.
+        */
+       cpumask_set_cpu(target, &cci_pmu->cpus);
        return 0;
 }
 
@@ -1768,10 +1761,8 @@ static int cci_pmu_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       mutex_lock(&cci_pmu_mutex);
-       list_add(&cci_pmu->entry, &cci_pmu_list);
-       mutex_unlock(&cci_pmu_mutex);
-
+       cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_CCI_ONLINE,
+                                        &cci_pmu->node);
        pr_info("ARM %s PMU driver probed", cci_pmu->model->name);
        return 0;
 }
@@ -1804,9 +1795,9 @@ static int __init cci_platform_init(void)
 {
        int ret;
 
-       ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_CCI_ONLINE,
-                                       "AP_PERF_ARM_CCI_ONLINE", NULL,
-                                       cci_pmu_offline_cpu);
+       ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_CCI_ONLINE,
+                                     "AP_PERF_ARM_CCI_ONLINE", NULL,
+                                     cci_pmu_offline_cpu);
        if (ret)
                return ret;
 
index 884c0305e29033085ff4131434f29f07bd5d78ae..d1074d9b38ba163e624ea1b996df38ad2164f285 100644 (file)
@@ -167,7 +167,7 @@ struct arm_ccn_dt {
        struct hrtimer hrtimer;
 
        cpumask_t cpu;
-       struct list_head entry;
+       struct hlist_node node;
 
        struct pmu pmu;
 };
@@ -190,9 +190,6 @@ struct arm_ccn {
        int mn_id;
 };
 
-static DEFINE_MUTEX(arm_ccn_mutex);
-static LIST_HEAD(arm_ccn_list);
-
 static int arm_ccn_node_to_xp(int node)
 {
        return node / CCN_NUM_XP_PORTS;
@@ -1214,30 +1211,24 @@ static enum hrtimer_restart arm_ccn_pmu_timer_handler(struct hrtimer *hrtimer)
 }
 
 
-static int arm_ccn_pmu_offline_cpu(unsigned int cpu)
+static int arm_ccn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
 {
-       struct arm_ccn_dt *dt;
+       struct arm_ccn_dt *dt = hlist_entry_safe(node, struct arm_ccn_dt, node);
+       struct arm_ccn *ccn = container_of(dt, struct arm_ccn, dt);
        unsigned int target;
 
-       mutex_lock(&arm_ccn_mutex);
-       list_for_each_entry(dt, &arm_ccn_list, entry) {
-               struct arm_ccn *ccn = container_of(dt, struct arm_ccn, dt);
-
-               if (!cpumask_test_and_clear_cpu(cpu, &dt->cpu))
-                       continue;
-               target = cpumask_any_but(cpu_online_mask, cpu);
-               if (target >= nr_cpu_ids)
-                       continue;
-               perf_pmu_migrate_context(&dt->pmu, cpu, target);
-               cpumask_set_cpu(target, &dt->cpu);
-               if (ccn->irq)
-                       WARN_ON(irq_set_affinity_hint(ccn->irq, &dt->cpu) != 0);
-       }
-       mutex_unlock(&arm_ccn_mutex);
+       if (!cpumask_test_and_clear_cpu(cpu, &dt->cpu))
+               return 0;
+       target = cpumask_any_but(cpu_online_mask, cpu);
+       if (target >= nr_cpu_ids)
+               return 0;
+       perf_pmu_migrate_context(&dt->pmu, cpu, target);
+       cpumask_set_cpu(target, &dt->cpu);
+       if (ccn->irq)
+               WARN_ON(irq_set_affinity_hint(ccn->irq, &dt->cpu) != 0);
        return 0;
 }
 
-
 static DEFINE_IDA(arm_ccn_pmu_ida);
 
 static int arm_ccn_pmu_init(struct arm_ccn *ccn)
@@ -1321,9 +1312,8 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn)
        if (err)
                goto error_pmu_register;
 
-       mutex_lock(&arm_ccn_mutex);
-       list_add(&ccn->dt.entry, &arm_ccn_list);
-       mutex_unlock(&arm_ccn_mutex);
+       cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE,
+                                        &ccn->dt.node);
        return 0;
 
 error_pmu_register:
@@ -1339,10 +1329,8 @@ static void arm_ccn_pmu_cleanup(struct arm_ccn *ccn)
 {
        int i;
 
-       mutex_lock(&arm_ccn_mutex);
-       list_del(&ccn->dt.entry);
-       mutex_unlock(&arm_ccn_mutex);
-
+       cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE,
+                                           &ccn->dt.node);
        if (ccn->irq)
                irq_set_affinity_hint(ccn->irq, NULL);
        for (i = 0; i < ccn->num_xps; i++)
@@ -1573,9 +1561,9 @@ static int __init arm_ccn_init(void)
 {
        int i, ret;
 
-       ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE,
-                                       "AP_PERF_ARM_CCN_ONLINE", NULL,
-                                       arm_ccn_pmu_offline_cpu);
+       ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_CCN_ONLINE,
+                                     "AP_PERF_ARM_CCN_ONLINE", NULL,
+                                     arm_ccn_pmu_offline_cpu);
        if (ret)
                return ret;
 
@@ -1587,7 +1575,7 @@ static int __init arm_ccn_init(void)
 
 static void __exit arm_ccn_exit(void)
 {
-       cpuhp_remove_state_nocalls(CPUHP_AP_PERF_ARM_CCN_ONLINE);
+       cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_CCN_ONLINE);
        platform_driver_unregister(&arm_ccn_driver);
 }
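
Both PMU conversions above follow the same multi-instance hotplug pattern: the driver registers a single state with cpuhp_setup_state_multi() at init time and attaches one hlist_node per device, so the offline callback is handed the affected instance directly instead of walking a driver-private list under a mutex. The following is only a minimal sketch of that pattern; the struct, the function names and the CPUHP_AP_MY_PMU_ONLINE constant are hypothetical stand-ins, not taken from either driver.

#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/list.h>

/* CPUHP_AP_MY_PMU_ONLINE: hypothetical new entry in enum cpuhp_state. */

struct my_pmu {
        struct hlist_node node;         /* linked into the cpuhp instance list */
        cpumask_t cpus;
};

/* Called once per registered instance when @cpu goes down. */
static int my_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
        struct my_pmu *pmu = hlist_entry_safe(node, struct my_pmu, node);

        if (!cpumask_test_and_clear_cpu(cpu, &pmu->cpus))
                return 0;
        /* ...pick a new CPU and migrate or re-home the PMU context here... */
        return 0;
}

static int __init my_driver_init(void)
{
        /* One state for the whole driver; instances are added per device. */
        return cpuhp_setup_state_multi(CPUHP_AP_MY_PMU_ONLINE,
                                       "AP_MY_PMU_ONLINE", NULL,
                                       my_pmu_offline_cpu);
}

static int my_pmu_probe(struct my_pmu *pmu)
{
        /* _nocalls: do not run the callback for CPUs that are already up. */
        return cpuhp_state_add_instance_nocalls(CPUHP_AP_MY_PMU_ONLINE,
                                                &pmu->node);
}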
 
index cad49bc38b3e7c7d7fd78c846fe51da7be2d1c49..1b14256376d240f094a3c57651661743a8576009 100644 (file)
@@ -596,19 +596,20 @@ BUILD_PERDEV_HELPER(cpu_down)       /* int mips_cdmm_cpu_down_helper(...) */
 BUILD_PERDEV_HELPER(cpu_up)         /* int mips_cdmm_cpu_up_helper(...) */
 
 /**
- * mips_cdmm_bus_down() - Tear down the CDMM bus.
- * @data:      Pointer to unsigned int CPU number.
+ * mips_cdmm_cpu_down_prep() - Callback for CPUHP DOWN_PREP:
+ *                            Tear down the CDMM bus.
+ * @cpu:       unsigned int CPU number.
  *
  * This function is executed on the hotplugged CPU and calls the CDMM
  * driver cpu_down callback for all devices on that CPU.
  */
-static long mips_cdmm_bus_down(void *data)
+static int mips_cdmm_cpu_down_prep(unsigned int cpu)
 {
        struct mips_cdmm_bus *bus;
        long ret;
 
        /* Inform all the devices on the bus */
-       ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, data,
+       ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, &cpu,
                               mips_cdmm_cpu_down_helper);
 
        /*
@@ -623,8 +624,8 @@ static long mips_cdmm_bus_down(void *data)
 }
 
 /**
- * mips_cdmm_bus_up() - Bring up the CDMM bus.
- * @data:      Pointer to unsigned int CPU number.
+ * mips_cdmm_cpu_online() - Callback for CPUHP ONLINE: Bring up the CDMM bus.
+ * @cpu:       unsigned int CPU number.
  *
  * This work_on_cpu callback function is executed on a given CPU to discover
  * CDMM devices on that CPU, or to call the CDMM driver cpu_up callback for all
@@ -634,7 +635,7 @@ static long mips_cdmm_bus_down(void *data)
  * initialisation. When CPUs are brought online the function is
  * invoked directly on the hotplugged CPU.
  */
-static long mips_cdmm_bus_up(void *data)
+static int mips_cdmm_cpu_online(unsigned int cpu)
 {
        struct mips_cdmm_bus *bus;
        long ret;
@@ -651,50 +652,12 @@ static long mips_cdmm_bus_up(void *data)
                mips_cdmm_bus_discover(bus);
        else
                /* Inform all the devices on the bus */
-               ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, data,
+               ret = bus_for_each_dev(&mips_cdmm_bustype, NULL, &cpu,
                                       mips_cdmm_cpu_up_helper);
 
        return ret;
 }
 
-/**
- * mips_cdmm_cpu_notify() - Take action when a CPU is going online or offline.
- * @nb:                CPU notifier block .
- * @action:    Event that has taken place (CPU_*).
- * @data:      CPU number.
- *
- * This notifier is used to keep the CDMM buses updated as CPUs are offlined and
- * onlined. When CPUs go offline or come back online, so does their CDMM bus, so
- * devices must be informed. Also when CPUs come online for the first time the
- * devices on the CDMM bus need discovering.
- *
- * Returns:    NOTIFY_OK if event was used.
- *             NOTIFY_DONE if we didn't care.
- */
-static int mips_cdmm_cpu_notify(struct notifier_block *nb,
-                               unsigned long action, void *data)
-{
-       unsigned int cpu = (unsigned int)data;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-               mips_cdmm_bus_up(&cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               mips_cdmm_bus_down(&cpu);
-               break;
-       default:
-               return NOTIFY_DONE;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block mips_cdmm_cpu_nb = {
-       .notifier_call = mips_cdmm_cpu_notify,
-};
-
 /**
  * mips_cdmm_init() - Initialise CDMM bus.
  *
@@ -703,7 +666,6 @@ static struct notifier_block mips_cdmm_cpu_nb = {
  */
 static int __init mips_cdmm_init(void)
 {
-       unsigned int cpu;
        int ret;
 
        /* Register the bus */
@@ -712,19 +674,11 @@ static int __init mips_cdmm_init(void)
                return ret;
 
        /* We want to be notified about new CPUs */
-       ret = register_cpu_notifier(&mips_cdmm_cpu_nb);
-       if (ret) {
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "bus/cdmm:online",
+                               mips_cdmm_cpu_online, mips_cdmm_cpu_down_prep);
+       if (ret < 0)
                pr_warn("cdmm: Failed to register CPU notifier\n");
-               goto out;
-       }
-
-       /* Discover devices on CDMM of online CPUs */
-       for_each_online_cpu(cpu)
-               work_on_cpu(cpu, mips_cdmm_bus_up, &cpu);
 
-       return 0;
-out:
-       bus_unregister(&mips_cdmm_bustype);
        return ret;
 }
 subsys_initcall(mips_cdmm_init);
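
The CDMM conversion above leans on a property of cpuhp_setup_state() (as opposed to the _nocalls variant): the online callback is invoked right away for every CPU that is already up, which is what made the explicit for_each_online_cpu()/work_on_cpu() discovery loop redundant. A minimal sketch of that registration shape follows; the bus name and callbacks are hypothetical, not the CDMM ones.

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int my_bus_cpu_online(unsigned int cpu)
{
        /* Runs on @cpu: discover or resume that CPU's devices here. */
        return 0;
}

static int my_bus_cpu_down_prep(unsigned int cpu)
{
        /* Runs on @cpu before it goes offline: quiesce its devices. */
        return 0;
}

static int __init my_bus_init(void)
{
        int ret;

        /*
         * CPUHP_AP_ONLINE_DYN allocates a state number dynamically; on
         * success cpuhp_setup_state() returns it (> 0) and has already
         * called my_bus_cpu_online() on every CPU that was online.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "bus/mybus:online",
                                my_bus_cpu_online, my_bus_cpu_down_prep);
        return ret < 0 ? ret : 0;
}
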
index d5657d50ac4044d5da886050e822517fe1134b6f..71e586d7df71722364f051ce0148479631c76fdb 100644 (file)
@@ -749,65 +749,52 @@ static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled)
        put_cpu();
 }
 
-/**
- * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions
- * @nb: notifier block
- * @action: hotplug transition
- * @hcpu: target cpu number
- *
- * Called when a cpu is brought on or offline using hotplug.  Updates the
- * coupled cpu set appropriately
- */
-static int cpuidle_coupled_cpu_notify(struct notifier_block *nb,
-               unsigned long action, void *hcpu)
+static int coupled_cpu_online(unsigned int cpu)
 {
-       int cpu = (unsigned long)hcpu;
        struct cpuidle_device *dev;
 
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-       case CPU_DOWN_PREPARE:
-       case CPU_ONLINE:
-       case CPU_DEAD:
-       case CPU_UP_CANCELED:
-       case CPU_DOWN_FAILED:
-               break;
-       default:
-               return NOTIFY_OK;
-       }
-
        mutex_lock(&cpuidle_lock);
 
        dev = per_cpu(cpuidle_devices, cpu);
-       if (!dev || !dev->coupled)
-               goto out;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-       case CPU_DOWN_PREPARE:
-               cpuidle_coupled_prevent_idle(dev->coupled);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
+       if (dev && dev->coupled) {
                cpuidle_coupled_update_online_cpus(dev->coupled);
-               /* Fall through */
-       case CPU_UP_CANCELED:
-       case CPU_DOWN_FAILED:
                cpuidle_coupled_allow_idle(dev->coupled);
-               break;
        }
 
-out:
        mutex_unlock(&cpuidle_lock);
-       return NOTIFY_OK;
+       return 0;
 }
 
-static struct notifier_block cpuidle_coupled_cpu_notifier = {
-       .notifier_call = cpuidle_coupled_cpu_notify,
-};
+static int coupled_cpu_up_prepare(unsigned int cpu)
+{
+       struct cpuidle_device *dev;
+
+       mutex_lock(&cpuidle_lock);
+
+       dev = per_cpu(cpuidle_devices, cpu);
+       if (dev && dev->coupled)
+               cpuidle_coupled_prevent_idle(dev->coupled);
+
+       mutex_unlock(&cpuidle_lock);
+       return 0;
+}
 
 static int __init cpuidle_coupled_init(void)
 {
-       return register_cpu_notifier(&cpuidle_coupled_cpu_notifier);
+       int ret;
+
+       ret = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE,
+                                       "cpuidle/coupled:prepare",
+                                       coupled_cpu_up_prepare,
+                                       coupled_cpu_online);
+       if (ret)
+               return ret;
+       ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                       "cpuidle/coupled:online",
+                                       coupled_cpu_online,
+                                       coupled_cpu_up_prepare);
+       if (ret < 0)
+               cpuhp_remove_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE);
+       return ret;
 }
 core_initcall(cpuidle_coupled_init);
index f7ca891b5b59ff4681edad72bd5392b9f1081d1f..7fe442ca38f4b793cef0f70731a62e215d5d37fc 100644 (file)
@@ -119,40 +119,30 @@ static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
                .enter = snooze_loop },
 };
 
-static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,
-                       unsigned long action, void *hcpu)
+static int powernv_cpuidle_cpu_online(unsigned int cpu)
 {
-       int hotcpu = (unsigned long)hcpu;
-       struct cpuidle_device *dev =
-                               per_cpu(cpuidle_devices, hotcpu);
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
        if (dev && cpuidle_get_driver()) {
-               switch (action) {
-               case CPU_ONLINE:
-               case CPU_ONLINE_FROZEN:
-                       cpuidle_pause_and_lock();
-                       cpuidle_enable_device(dev);
-                       cpuidle_resume_and_unlock();
-                       break;
+               cpuidle_pause_and_lock();
+               cpuidle_enable_device(dev);
+               cpuidle_resume_and_unlock();
+       }
+       return 0;
+}
 
-               case CPU_DEAD:
-               case CPU_DEAD_FROZEN:
-                       cpuidle_pause_and_lock();
-                       cpuidle_disable_device(dev);
-                       cpuidle_resume_and_unlock();
-                       break;
+static int powernv_cpuidle_cpu_dead(unsigned int cpu)
+{
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
-               default:
-                       return NOTIFY_DONE;
-               }
+       if (dev && cpuidle_get_driver()) {
+               cpuidle_pause_and_lock();
+               cpuidle_disable_device(dev);
+               cpuidle_resume_and_unlock();
        }
-       return NOTIFY_OK;
+       return 0;
 }
 
-static struct notifier_block setup_hotplug_notifier = {
-       .notifier_call = powernv_cpuidle_add_cpu_notifier,
-};
-
 /*
  * powernv_cpuidle_driver_init()
  */
@@ -355,7 +345,14 @@ static int __init powernv_processor_idle_init(void)
                return retval;
        }
 
-       register_cpu_notifier(&setup_hotplug_notifier);
+       retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                          "cpuidle/powernv:online",
+                                          powernv_cpuidle_cpu_online, NULL);
+       WARN_ON(retval < 0);
+       retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
+                                          "cpuidle/powernv:dead", NULL,
+                                          powernv_cpuidle_cpu_dead);
+       WARN_ON(retval < 0);
        printk(KERN_DEBUG "powernv_idle_driver registered\n");
        return 0;
 }
index 07135e009d8b9ce7590c3b442d610c790a1ff206..166ccd711ec970c365c4743ed0e808a8049f0c63 100644 (file)
@@ -171,40 +171,30 @@ static struct cpuidle_state shared_states[] = {
                .enter = &shared_cede_loop },
 };
 
-static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
-                       unsigned long action, void *hcpu)
+static int pseries_cpuidle_cpu_online(unsigned int cpu)
 {
-       int hotcpu = (unsigned long)hcpu;
-       struct cpuidle_device *dev =
-                               per_cpu(cpuidle_devices, hotcpu);
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
        if (dev && cpuidle_get_driver()) {
-               switch (action) {
-               case CPU_ONLINE:
-               case CPU_ONLINE_FROZEN:
-                       cpuidle_pause_and_lock();
-                       cpuidle_enable_device(dev);
-                       cpuidle_resume_and_unlock();
-                       break;
+               cpuidle_pause_and_lock();
+               cpuidle_enable_device(dev);
+               cpuidle_resume_and_unlock();
+       }
+       return 0;
+}
 
-               case CPU_DEAD:
-               case CPU_DEAD_FROZEN:
-                       cpuidle_pause_and_lock();
-                       cpuidle_disable_device(dev);
-                       cpuidle_resume_and_unlock();
-                       break;
+static int pseries_cpuidle_cpu_dead(unsigned int cpu)
+{
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
-               default:
-                       return NOTIFY_DONE;
-               }
+       if (dev && cpuidle_get_driver()) {
+               cpuidle_pause_and_lock();
+               cpuidle_disable_device(dev);
+               cpuidle_resume_and_unlock();
        }
-       return NOTIFY_OK;
+       return 0;
 }
 
-static struct notifier_block setup_hotplug_notifier = {
-       .notifier_call = pseries_cpuidle_add_cpu_notifier,
-};
-
 /*
  * pseries_cpuidle_driver_init()
  */
@@ -273,7 +263,14 @@ static int __init pseries_processor_idle_init(void)
                return retval;
        }
 
-       register_cpu_notifier(&setup_hotplug_notifier);
+       retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                          "cpuidle/pseries:online",
+                                          pseries_cpuidle_cpu_online, NULL);
+       WARN_ON(retval < 0);
+       retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
+                                          "cpuidle/pseries:dead", NULL,
+                                          pseries_cpuidle_cpu_dead);
+       WARN_ON(retval < 0);
        printk(KERN_DEBUG "pseries_idle_driver registered\n");
        return 0;
 }
index 829eec8959f24868a59ed8354160291878ebf68c..2fe1a130189f99d40b87bfe308bd47cc2817e51f 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/errno.h>
+#include <linux/cpu.h>
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -238,33 +239,14 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
        return count;
 }
 
-/**
- * dcdbas_smi_request: generate SMI request
- *
- * Called with smi_data_lock.
- */
-int dcdbas_smi_request(struct smi_cmd *smi_cmd)
+static int raise_smi(void *par)
 {
-       cpumask_var_t old_mask;
-       int ret = 0;
-
-       if (smi_cmd->magic != SMI_CMD_MAGIC) {
-               dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n",
-                        __func__);
-               return -EBADR;
-       }
+       struct smi_cmd *smi_cmd = par;
 
-       /* SMI requires CPU 0 */
-       if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
-               return -ENOMEM;
-
-       cpumask_copy(old_mask, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(0));
        if (smp_processor_id() != 0) {
                dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
                        __func__);
-               ret = -EBUSY;
-               goto out;
+               return -EBUSY;
        }
 
        /* generate SMI */
@@ -280,9 +262,28 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
                : "memory"
        );
 
-out:
-       set_cpus_allowed_ptr(current, old_mask);
-       free_cpumask_var(old_mask);
+       return 0;
+}
+/**
+ * dcdbas_smi_request: generate SMI request
+ *
+ * Called with smi_data_lock.
+ */
+int dcdbas_smi_request(struct smi_cmd *smi_cmd)
+{
+       int ret;
+
+       if (smi_cmd->magic != SMI_CMD_MAGIC) {
+               dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n",
+                        __func__);
+               return -EBADR;
+       }
+
+       /* SMI requires CPU 0 */
+       get_online_cpus();
+       ret = smp_call_on_cpu(0, raise_smi, smi_cmd, true);
+       put_online_cpus();
+
        return ret;
 }
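
The i8k driver further down gets the same treatment: rather than rewriting the calling task's affinity mask, work that must run on CPU 0 is handed to the new smp_call_on_cpu() helper, which queues it on that CPU's workqueue, waits for completion and returns the callback's return value (or an error such as -ENXIO if the CPU is offline). A minimal sketch of the calling shape, with a hypothetical callback:

#include <linux/cpu.h>
#include <linux/smp.h>

/* Executed on CPU 0 in workqueue context; its return value is passed back. */
static int do_on_cpu0(void *arg)
{
        /* ...issue the CPU-0-only operation (SMI/SMM call, etc.)... */
        return 0;
}

static int run_on_cpu0(void *arg)
{
        int ret;

        get_online_cpus();      /* keep CPU 0 from being unplugged meanwhile */
        /* phys=true additionally pins the vCPU on hypervisors that allow it. */
        ret = smp_call_on_cpu(0, do_on_cpu0, arg, true);
        put_online_cpus();

        return ret;
}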
 
index 5a2631af7410782dc8f7ba993ab0b1139bf71abb..7dd2e2d372317f8a4ddb1fe464b29f662a001a28 100644 (file)
@@ -657,9 +657,12 @@ static int __init fdt_find_uefi_params(unsigned long node, const char *uname,
                }
 
                if (subnode) {
-                       node = of_get_flat_dt_subnode_by_name(node, subnode);
-                       if (node < 0)
+                       int err = of_get_flat_dt_subnode_by_name(node, subnode);
+
+                       if (err < 0)
                                return 0;
+
+                       node = err;
                }
 
                return __find_uefi_params(node, info, dt_params[i].params);
index 3bd127f953151dff89d0563db13deb12046e4315..aded10662020493390e29cb74a1910a4c51e4172 100644 (file)
@@ -41,6 +41,8 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
 #define EFI_ALLOC_ALIGN                EFI_PAGE_SIZE
 #endif
 
+#define EFI_MMAP_NR_SLACK_SLOTS        8
+
 struct file_info {
        efi_file_handle_t *handle;
        u64 size;
@@ -63,49 +65,62 @@ void efi_printk(efi_system_table_t *sys_table_arg, char *str)
        }
 }
 
+static inline bool mmap_has_headroom(unsigned long buff_size,
+                                    unsigned long map_size,
+                                    unsigned long desc_size)
+{
+       unsigned long slack = buff_size - map_size;
+
+       return slack / desc_size >= EFI_MMAP_NR_SLACK_SLOTS;
+}
+
 efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg,
-                               efi_memory_desc_t **map,
-                               unsigned long *map_size,
-                               unsigned long *desc_size,
-                               u32 *desc_ver,
-                               unsigned long *key_ptr)
+                               struct efi_boot_memmap *map)
 {
        efi_memory_desc_t *m = NULL;
        efi_status_t status;
        unsigned long key;
        u32 desc_version;
 
-       *map_size = sizeof(*m) * 32;
+       *map->desc_size =       sizeof(*m);
+       *map->map_size =        *map->desc_size * 32;
+       *map->buff_size =       *map->map_size;
 again:
-       /*
-        * Add an additional efi_memory_desc_t because we're doing an
-        * allocation which may be in a new descriptor region.
-        */
-       *map_size += sizeof(*m);
        status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
-                               *map_size, (void **)&m);
+                               *map->map_size, (void **)&m);
        if (status != EFI_SUCCESS)
                goto fail;
 
-       *desc_size = 0;
+       *map->desc_size = 0;
        key = 0;
-       status = efi_call_early(get_memory_map, map_size, m,
-                               &key, desc_size, &desc_version);
-       if (status == EFI_BUFFER_TOO_SMALL) {
+       status = efi_call_early(get_memory_map, map->map_size, m,
+                               &key, map->desc_size, &desc_version);
+       if (status == EFI_BUFFER_TOO_SMALL ||
+           !mmap_has_headroom(*map->buff_size, *map->map_size,
+                              *map->desc_size)) {
                efi_call_early(free_pool, m);
+               /*
+                * Make sure there are a few entries of headroom so that the
+                * buffer can be reused for a new map after allocations are
+                * no longer permitted.  It's unlikely that the map will grow
+                * to exceed this headroom once we are ready to trigger
+                * ExitBootServices().
+                */
+               *map->map_size += *map->desc_size * EFI_MMAP_NR_SLACK_SLOTS;
+               *map->buff_size = *map->map_size;
                goto again;
        }
 
        if (status != EFI_SUCCESS)
                efi_call_early(free_pool, m);
 
-       if (key_ptr && status == EFI_SUCCESS)
-               *key_ptr = key;
-       if (desc_ver && status == EFI_SUCCESS)
-               *desc_ver = desc_version;
+       if (map->key_ptr && status == EFI_SUCCESS)
+               *map->key_ptr = key;
+       if (map->desc_ver && status == EFI_SUCCESS)
+               *map->desc_ver = desc_version;
 
 fail:
-       *map = m;
+       *map->map = m;
        return status;
 }
 
@@ -113,13 +128,20 @@ fail:
 unsigned long get_dram_base(efi_system_table_t *sys_table_arg)
 {
        efi_status_t status;
-       unsigned long map_size;
+       unsigned long map_size, buff_size;
        unsigned long membase  = EFI_ERROR;
        struct efi_memory_map map;
        efi_memory_desc_t *md;
+       struct efi_boot_memmap boot_map;
 
-       status = efi_get_memory_map(sys_table_arg, (efi_memory_desc_t **)&map.map,
-                                   &map_size, &map.desc_size, NULL, NULL);
+       boot_map.map =          (efi_memory_desc_t **)&map.map;
+       boot_map.map_size =     &map_size;
+       boot_map.desc_size =    &map.desc_size;
+       boot_map.desc_ver =     NULL;
+       boot_map.key_ptr =      NULL;
+       boot_map.buff_size =    &buff_size;
+
+       status = efi_get_memory_map(sys_table_arg, &boot_map);
        if (status != EFI_SUCCESS)
                return membase;
 
@@ -144,15 +166,22 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg,
                            unsigned long size, unsigned long align,
                            unsigned long *addr, unsigned long max)
 {
-       unsigned long map_size, desc_size;
+       unsigned long map_size, desc_size, buff_size;
        efi_memory_desc_t *map;
        efi_status_t status;
        unsigned long nr_pages;
        u64 max_addr = 0;
        int i;
+       struct efi_boot_memmap boot_map;
+
+       boot_map.map =          &map;
+       boot_map.map_size =     &map_size;
+       boot_map.desc_size =    &desc_size;
+       boot_map.desc_ver =     NULL;
+       boot_map.key_ptr =      NULL;
+       boot_map.buff_size =    &buff_size;
 
-       status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size,
-                                   NULL, NULL);
+       status = efi_get_memory_map(sys_table_arg, &boot_map);
        if (status != EFI_SUCCESS)
                goto fail;
 
@@ -230,14 +259,21 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg,
                           unsigned long size, unsigned long align,
                           unsigned long *addr)
 {
-       unsigned long map_size, desc_size;
+       unsigned long map_size, desc_size, buff_size;
        efi_memory_desc_t *map;
        efi_status_t status;
        unsigned long nr_pages;
        int i;
+       struct efi_boot_memmap boot_map;
 
-       status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size,
-                                   NULL, NULL);
+       boot_map.map =          &map;
+       boot_map.map_size =     &map_size;
+       boot_map.desc_size =    &desc_size;
+       boot_map.desc_ver =     NULL;
+       boot_map.key_ptr =      NULL;
+       boot_map.buff_size =    &buff_size;
+
+       status = efi_get_memory_map(sys_table_arg, &boot_map);
        if (status != EFI_SUCCESS)
                goto fail;
 
@@ -704,3 +740,76 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg,
        *cmd_line_len = options_bytes;
        return (char *)cmdline_addr;
 }
+
+/*
+ * Handle calling ExitBootServices according to the requirements set out by
+ * the spec.  Obtains the current memory map and returns that info after
+ * calling ExitBootServices.  The client must specify a function to perform
+ * any processing of the memory map data prior to ExitBootServices.  A
+ * client-specific structure may be passed to the function via priv.  The
+ * client function may be called multiple times.
+ */
+efi_status_t efi_exit_boot_services(efi_system_table_t *sys_table_arg,
+                                   void *handle,
+                                   struct efi_boot_memmap *map,
+                                   void *priv,
+                                   efi_exit_boot_map_processing priv_func)
+{
+       efi_status_t status;
+
+       status = efi_get_memory_map(sys_table_arg, map);
+
+       if (status != EFI_SUCCESS)
+               goto fail;
+
+       status = priv_func(sys_table_arg, map, priv);
+       if (status != EFI_SUCCESS)
+               goto free_map;
+
+       status = efi_call_early(exit_boot_services, handle, *map->key_ptr);
+
+       if (status == EFI_INVALID_PARAMETER) {
+               /*
+                * The memory map changed between efi_get_memory_map() and
+                * exit_boot_services().  Per the UEFI Spec v2.6, Section 6.4
+                * (EFI_BOOT_SERVICES.ExitBootServices), we need to get the
+                * updated map and try again.  The spec implies one retry
+                * should be sufficient, which is confirmed against the EDK2
+                * implementation.  Per the spec, we can only invoke
+                * get_memory_map() and exit_boot_services() - we cannot
+                * allocate, so efi_get_memory_map() cannot be used and we must
+                * reuse the buffer.  For all practical purposes, the headroom
+                * in the buffer should account for any changes in the map, so
+                * the call to get_memory_map() is expected to succeed here.
+                */
+               *map->map_size = *map->buff_size;
+               status = efi_call_early(get_memory_map,
+                                       map->map_size,
+                                       *map->map,
+                                       map->key_ptr,
+                                       map->desc_size,
+                                       map->desc_ver);
+
+               /* exit_boot_services() was called, thus cannot free */
+               if (status != EFI_SUCCESS)
+                       goto fail;
+
+               status = priv_func(sys_table_arg, map, priv);
+               /* exit_boot_services() was called, thus cannot free */
+               if (status != EFI_SUCCESS)
+                       goto fail;
+
+               status = efi_call_early(exit_boot_services, handle, *map->key_ptr);
+       }
+
+       /* exit_boot_services() was called, thus cannot free */
+       if (status != EFI_SUCCESS)
+               goto fail;
+
+       return EFI_SUCCESS;
+
+free_map:
+       efi_call_early(free_pool, *map->map);
+fail:
+       return status;
+}
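
efi_exit_boot_services() passes all of its memory-map state through the new struct efi_boot_memmap, whose declaration lives elsewhere in this series (include/linux/efi.h) and is therefore not visible in this hunk. As the call sites here and in the ARM stub below imply, it is simply a bag of pointers that the caller aims at its own locals. The sketch below spells out that assumed layout and a minimal caller; the my_* names are hypothetical, and the EFI base types come from linux/efi.h.

/* Assumed layout, as implied by the call sites in this series. */
struct efi_boot_memmap {
        efi_memory_desc_t       **map;
        unsigned long           *map_size;
        unsigned long           *desc_size;
        u32                     *desc_ver;
        unsigned long           *key_ptr;
        unsigned long           *buff_size;
};

/* Hook invoked (possibly more than once) before ExitBootServices. */
static efi_status_t my_process_map(efi_system_table_t *sys_table_arg,
                                   struct efi_boot_memmap *map, void *priv)
{
        /* Inspect or copy *map->map here; do not allocate boot memory. */
        return EFI_SUCCESS;
}

static efi_status_t my_exit_boot(efi_system_table_t *sys_table, void *handle)
{
        unsigned long map_size, desc_size, buff_size, mmap_key;
        efi_memory_desc_t *memory_map;
        u32 desc_ver;
        struct efi_boot_memmap map;

        map.map         = &memory_map;
        map.map_size    = &map_size;
        map.desc_size   = &desc_size;
        map.desc_ver    = &desc_ver;
        map.key_ptr     = &mmap_key;
        map.buff_size   = &buff_size;

        return efi_exit_boot_services(sys_table, handle, &map, NULL,
                                      my_process_map);
}
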
index e58abfa953cc5a30f49e4bdbd5f2857c6bcf876d..a6a93116a8f053f6c14911376ffa6da7f1dff44e 100644 (file)
@@ -152,6 +152,27 @@ fdt_set_fail:
 #define EFI_FDT_ALIGN EFI_PAGE_SIZE
 #endif
 
+struct exit_boot_struct {
+       efi_memory_desc_t *runtime_map;
+       int *runtime_entry_count;
+};
+
+static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
+                                  struct efi_boot_memmap *map,
+                                  void *priv)
+{
+       struct exit_boot_struct *p = priv;
+       /*
+        * Update the memory map with virtual addresses. The function will also
+        * populate @runtime_map with copies of just the EFI_MEMORY_RUNTIME
+        * entries so that we can pass it straight to SetVirtualAddressMap()
+        */
+       efi_get_virtmap(*map->map, *map->map_size, *map->desc_size,
+                       p->runtime_map, p->runtime_entry_count);
+
+       return EFI_SUCCESS;
+}
+
 /*
  * Allocate memory for a new FDT, then add EFI, commandline, and
  * initrd related fields to the FDT.  This routine increases the
@@ -175,13 +196,22 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
                                            unsigned long fdt_addr,
                                            unsigned long fdt_size)
 {
-       unsigned long map_size, desc_size;
+       unsigned long map_size, desc_size, buff_size;
        u32 desc_ver;
        unsigned long mmap_key;
        efi_memory_desc_t *memory_map, *runtime_map;
        unsigned long new_fdt_size;
        efi_status_t status;
        int runtime_entry_count = 0;
+       struct efi_boot_memmap map;
+       struct exit_boot_struct priv;
+
+       map.map =       &runtime_map;
+       map.map_size =  &map_size;
+       map.desc_size = &desc_size;
+       map.desc_ver =  &desc_ver;
+       map.key_ptr =   &mmap_key;
+       map.buff_size = &buff_size;
 
        /*
         * Get a copy of the current memory map that we will use to prepare
@@ -189,8 +219,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
         * subsequent allocations adding entries, since they could not affect
         * the number of EFI_MEMORY_RUNTIME regions.
         */
-       status = efi_get_memory_map(sys_table, &runtime_map, &map_size,
-                                   &desc_size, &desc_ver, &mmap_key);
+       status = efi_get_memory_map(sys_table, &map);
        if (status != EFI_SUCCESS) {
                pr_efi_err(sys_table, "Unable to retrieve UEFI memory map.\n");
                return status;
@@ -199,6 +228,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
        pr_efi(sys_table,
               "Exiting boot services and installing virtual address map...\n");
 
+       map.map = &memory_map;
        /*
         * Estimate size of new FDT, and allocate memory for it. We
         * will allocate a bigger buffer if this ends up being too
@@ -218,8 +248,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
                 * we can get the memory map key  needed for
                 * exit_boot_services().
                 */
-               status = efi_get_memory_map(sys_table, &memory_map, &map_size,
-                                           &desc_size, &desc_ver, &mmap_key);
+               status = efi_get_memory_map(sys_table, &map);
                if (status != EFI_SUCCESS)
                        goto fail_free_new_fdt;
 
@@ -250,16 +279,11 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
                }
        }
 
-       /*
-        * Update the memory map with virtual addresses. The function will also
-        * populate @runtime_map with copies of just the EFI_MEMORY_RUNTIME
-        * entries so that we can pass it straight into SetVirtualAddressMap()
-        */
-       efi_get_virtmap(memory_map, map_size, desc_size, runtime_map,
-                       &runtime_entry_count);
-
-       /* Now we are ready to exit_boot_services.*/
-       status = sys_table->boottime->exit_boot_services(handle, mmap_key);
+       sys_table->boottime->free_pool(memory_map);
+       priv.runtime_map = runtime_map;
+       priv.runtime_entry_count = &runtime_entry_count;
+       status = efi_exit_boot_services(sys_table, handle, &map, &priv,
+                                       exit_boot_func);
 
        if (status == EFI_SUCCESS) {
                efi_set_virtual_address_map_t *svam;
index 53f6d3fe6d8621519634d3aa2299aeaf5f2ca90d..0c9f58c5ba501157f0c8826ac0d80b485d86b716 100644 (file)
@@ -73,12 +73,20 @@ efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg,
                              unsigned long random_seed)
 {
        unsigned long map_size, desc_size, total_slots = 0, target_slot;
+       unsigned long buff_size;
        efi_status_t status;
        efi_memory_desc_t *memory_map;
        int map_offset;
+       struct efi_boot_memmap map;
 
-       status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size,
-                                   &desc_size, NULL, NULL);
+       map.map =       &memory_map;
+       map.map_size =  &map_size;
+       map.desc_size = &desc_size;
+       map.desc_ver =  NULL;
+       map.key_ptr =   NULL;
+       map.buff_size = &buff_size;
+
+       status = efi_get_memory_map(sys_table_arg, &map);
        if (status != EFI_SUCCESS)
                return status;
 
index acf9c0361d9f7e16f639fbbfeac642b1ca5258b5..34704b0451b49700b5bf5ce7eb47dea905680a17 100644 (file)
@@ -21,6 +21,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -36,6 +37,7 @@
 #include <linux/io.h>
 #include <linux/sched.h>
 #include <linux/ctype.h>
+#include <linux/smp.h>
 
 #include <linux/i8k.h>
 
@@ -134,11 +136,11 @@ static inline const char *i8k_get_dmi_data(int field)
 /*
  * Call the System Management Mode BIOS. Code provided by Jonathan Buzzard.
  */
-static int i8k_smm(struct smm_regs *regs)
+static int i8k_smm_func(void *par)
 {
        int rc;
+       struct smm_regs *regs = par;
        int eax = regs->eax;
-       cpumask_var_t old_mask;
 
 #ifdef DEBUG
        int ebx = regs->ebx;
@@ -149,16 +151,8 @@ static int i8k_smm(struct smm_regs *regs)
 #endif
 
        /* SMM requires CPU 0 */
-       if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
-               return -ENOMEM;
-       cpumask_copy(old_mask, &current->cpus_allowed);
-       rc = set_cpus_allowed_ptr(current, cpumask_of(0));
-       if (rc)
-               goto out;
-       if (smp_processor_id() != 0) {
-               rc = -EBUSY;
-               goto out;
-       }
+       if (smp_processor_id() != 0)
+               return -EBUSY;
 
 #if defined(CONFIG_X86_64)
        asm volatile("pushq %%rax\n\t"
@@ -216,10 +210,6 @@ static int i8k_smm(struct smm_regs *regs)
        if (rc != 0 || (regs->eax & 0xffff) == 0xffff || regs->eax == eax)
                rc = -EINVAL;
 
-out:
-       set_cpus_allowed_ptr(current, old_mask);
-       free_cpumask_var(old_mask);
-
 #ifdef DEBUG
        rettime = ktime_get();
        delta = ktime_sub(rettime, calltime);
@@ -231,6 +221,20 @@ out:
        return rc;
 }
 
+/*
+ * Call the System Management Mode BIOS.
+ */
+static int i8k_smm(struct smm_regs *regs)
+{
+       int ret;
+
+       get_online_cpus();
+       ret = smp_call_on_cpu(0, i8k_smm_func, regs, true);
+       put_online_cpus();
+
+       return ret;
+}
+
 /*
  * Read the fan status.
  */
index 5883ef0d95bf9a6f17eadb243fc40d61cc9909ed..766c3b7cf0b8f5278a4d79dc1c569625b6dd7697 100644 (file)
@@ -6349,22 +6349,20 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
        return 0;
 }
 
-static void raid5_free_percpu(struct r5conf *conf)
+static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
 {
-       unsigned long cpu;
+       struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
+
+       free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+       return 0;
+}
 
+static void raid5_free_percpu(struct r5conf *conf)
+{
        if (!conf->percpu)
                return;
 
-#ifdef CONFIG_HOTPLUG_CPU
-       unregister_cpu_notifier(&conf->cpu_notify);
-#endif
-
-       get_online_cpus();
-       for_each_possible_cpu(cpu)
-               free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-       put_online_cpus();
-
+       cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
        free_percpu(conf->percpu);
 }
 
@@ -6383,64 +6381,28 @@ static void free_conf(struct r5conf *conf)
        kfree(conf);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
-                             void *hcpu)
+static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
 {
-       struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
-       long cpu = (long)hcpu;
+       struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
        struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               if (alloc_scratch_buffer(conf, percpu)) {
-                       pr_err("%s: failed memory allocation for cpu%ld\n",
-                              __func__, cpu);
-                       return notifier_from_errno(-ENOMEM);
-               }
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-               break;
-       default:
-               break;
+       if (alloc_scratch_buffer(conf, percpu)) {
+               pr_err("%s: failed memory allocation for cpu%u\n",
+                      __func__, cpu);
+               return -ENOMEM;
        }
-       return NOTIFY_OK;
+       return 0;
 }
-#endif
 
 static int raid5_alloc_percpu(struct r5conf *conf)
 {
-       unsigned long cpu;
        int err = 0;
 
        conf->percpu = alloc_percpu(struct raid5_percpu);
        if (!conf->percpu)
                return -ENOMEM;
 
-#ifdef CONFIG_HOTPLUG_CPU
-       conf->cpu_notify.notifier_call = raid456_cpu_notify;
-       conf->cpu_notify.priority = 0;
-       err = register_cpu_notifier(&conf->cpu_notify);
-       if (err)
-               return err;
-#endif
-
-       get_online_cpus();
-       for_each_present_cpu(cpu) {
-               err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-               if (err) {
-                       pr_err("%s: failed memory allocation for cpu%ld\n",
-                              __func__, cpu);
-                       break;
-               }
-       }
-       put_online_cpus();
-
+       err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
        if (!err) {
                conf->scribble_disks = max(conf->raid_disks,
                        conf->previous_raid_disks);
@@ -7987,10 +7949,21 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+       int ret;
+
        raid5_wq = alloc_workqueue("raid5wq",
                WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
        if (!raid5_wq)
                return -ENOMEM;
+
+       ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
+                                     "md/raid5:prepare",
+                                     raid456_cpu_up_prepare,
+                                     raid456_cpu_dead);
+       if (ret) {
+               destroy_workqueue(raid5_wq);
+               return ret;
+       }
        register_md_personality(&raid6_personality);
        register_md_personality(&raid5_personality);
        register_md_personality(&raid4_personality);
@@ -8002,6 +7975,7 @@ static void raid5_exit(void)
        unregister_md_personality(&raid6_personality);
        unregister_md_personality(&raid5_personality);
        unregister_md_personality(&raid4_personality);
+       cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
        destroy_workqueue(raid5_wq);
 }
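
Unlike the PMU conversions earlier, raid5 adds its instance with cpuhp_state_add_instance() rather than the _nocalls variant, so the prepare callback runs immediately for every CPU that has already passed the CPUHP_MD_RAID5_PREPARE state; that is what replaces the old for_each_present_cpu() scratch-buffer allocation loop. A minimal sketch of this variant, with hypothetical names standing in for the raid5 ones:

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/list.h>

struct my_conf {
        struct hlist_node node;
        /* per-CPU scratch state would hang off a __percpu pointer here */
};

static int my_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
{
        /* Allocate @cpu's scratch buffers; runs before the CPU comes up. */
        return 0;
}

static int my_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        /* Free @cpu's scratch buffers once the CPU is fully down. */
        return 0;
}

static int __init my_module_init(void)
{
        return cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
                                       "md/myraid:prepare",
                                       my_cpu_up_prepare, my_cpu_dead);
}

static int my_alloc_percpu(struct my_conf *conf)
{
        /*
         * No _nocalls here: the prepare callback is invoked right away for
         * every CPU that already reached the state, mirroring the old
         * for_each_present_cpu() allocation loop.
         */
        return cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
}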
 
index 517d4b68a1befcad9e4b383c0ed077133d25b3b3..57ec49f0839e53d18281fb7622497d446115dcc0 100644 (file)
@@ -512,9 +512,7 @@ struct r5conf {
        } __percpu *percpu;
        int scribble_disks;
        int scribble_sectors;
-#ifdef CONFIG_HOTPLUG_CPU
-       struct notifier_block   cpu_notify;
-#endif
+       struct hlist_node node;
 
        /*
         * Free stripes pool
index 8e4252dd9a9db7a406bda66321b2db4da631de5b..32f0cc4256b3eefb3052d32c84a9c5c4346e9d10 100644 (file)
@@ -382,7 +382,8 @@ struct mvneta_port {
        struct mvneta_rx_queue *rxqs;
        struct mvneta_tx_queue *txqs;
        struct net_device *dev;
-       struct notifier_block cpu_notifier;
+       struct hlist_node node_online;
+       struct hlist_node node_dead;
        int rxq_def;
        /* Protect the access to the percpu interrupt registers,
         * ensuring that the configuration remains coherent.
@@ -573,6 +574,7 @@ struct mvneta_rx_queue {
        int next_desc_to_proc;
 };
 
+static enum cpuhp_state online_hpstate;
 /* The hardware supports eight (8) rx queues, but we are only allowing
  * the first one to be used. Therefore, let's just allocate one queue.
  */
@@ -3313,101 +3315,104 @@ static void mvneta_percpu_elect(struct mvneta_port *pp)
        }
 };
 
-static int mvneta_percpu_notifier(struct notifier_block *nfb,
-                                 unsigned long action, void *hcpu)
+static int mvneta_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
-       struct mvneta_port *pp = container_of(nfb, struct mvneta_port,
-                                             cpu_notifier);
-       int cpu = (unsigned long)hcpu, other_cpu;
+       int other_cpu;
+       struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port,
+                                                 node_online);
        struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu);
 
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               spin_lock(&pp->lock);
-               /* Configuring the driver for a new CPU while the
-                * driver is stopping is racy, so just avoid it.
-                */
-               if (pp->is_stopped) {
-                       spin_unlock(&pp->lock);
-                       break;
-               }
-               netif_tx_stop_all_queues(pp->dev);
 
-               /* We have to synchronise on tha napi of each CPU
-                * except the one just being waked up
-                */
-               for_each_online_cpu(other_cpu) {
-                       if (other_cpu != cpu) {
-                               struct mvneta_pcpu_port *other_port =
-                                       per_cpu_ptr(pp->ports, other_cpu);
+       spin_lock(&pp->lock);
+       /*
+        * Configuring the driver for a new CPU while the driver is
+        * stopping is racy, so just avoid it.
+        */
+       if (pp->is_stopped) {
+               spin_unlock(&pp->lock);
+               return 0;
+       }
+       netif_tx_stop_all_queues(pp->dev);
 
-                               napi_synchronize(&other_port->napi);
-                       }
+       /*
+        * We have to synchronise on the napi of each CPU except the one
+        * just being woken up.
+        */
+       for_each_online_cpu(other_cpu) {
+               if (other_cpu != cpu) {
+                       struct mvneta_pcpu_port *other_port =
+                               per_cpu_ptr(pp->ports, other_cpu);
+
+                       napi_synchronize(&other_port->napi);
                }
+       }
 
-               /* Mask all ethernet port interrupts */
-               on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
-               napi_enable(&port->napi);
+       /* Mask all ethernet port interrupts */
+       on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
+       napi_enable(&port->napi);
 
+       /*
+        * Enable per-CPU interrupts on the CPU that is
+        * brought up.
+        */
+       mvneta_percpu_enable(pp);
 
-               /* Enable per-CPU interrupts on the CPU that is
-                * brought up.
-                */
-               mvneta_percpu_enable(pp);
+       /*
+        * Enable per-CPU interrupt on the one CPU we care
+        * about.
+        */
+       mvneta_percpu_elect(pp);
 
-               /* Enable per-CPU interrupt on the one CPU we care
-                * about.
-                */
-               mvneta_percpu_elect(pp);
-
-               /* Unmask all ethernet port interrupts */
-               on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
-               mvreg_write(pp, MVNETA_INTR_MISC_MASK,
-                       MVNETA_CAUSE_PHY_STATUS_CHANGE |
-                       MVNETA_CAUSE_LINK_CHANGE |
-                       MVNETA_CAUSE_PSC_SYNC_CHANGE);
-               netif_tx_start_all_queues(pp->dev);
-               spin_unlock(&pp->lock);
-               break;
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               netif_tx_stop_all_queues(pp->dev);
-               /* Thanks to this lock we are sure that any pending
-                * cpu election is done
-                */
-               spin_lock(&pp->lock);
-               /* Mask all ethernet port interrupts */
-               on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
-               spin_unlock(&pp->lock);
+       /* Unmask all ethernet port interrupts */
+       on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
+       mvreg_write(pp, MVNETA_INTR_MISC_MASK,
+                   MVNETA_CAUSE_PHY_STATUS_CHANGE |
+                   MVNETA_CAUSE_LINK_CHANGE |
+                   MVNETA_CAUSE_PSC_SYNC_CHANGE);
+       netif_tx_start_all_queues(pp->dev);
+       spin_unlock(&pp->lock);
+       return 0;
+}
 
-               napi_synchronize(&port->napi);
-               napi_disable(&port->napi);
-               /* Disable per-CPU interrupts on the CPU that is
-                * brought down.
-                */
-               mvneta_percpu_disable(pp);
+static int mvneta_cpu_down_prepare(unsigned int cpu, struct hlist_node *node)
+{
+       struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port,
+                                                 node_online);
+       struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu);
 
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /* Check if a new CPU must be elected now this on is down */
-               spin_lock(&pp->lock);
-               mvneta_percpu_elect(pp);
-               spin_unlock(&pp->lock);
-               /* Unmask all ethernet port interrupts */
-               on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
-               mvreg_write(pp, MVNETA_INTR_MISC_MASK,
-                       MVNETA_CAUSE_PHY_STATUS_CHANGE |
-                       MVNETA_CAUSE_LINK_CHANGE |
-                       MVNETA_CAUSE_PSC_SYNC_CHANGE);
-               netif_tx_start_all_queues(pp->dev);
-               break;
-       }
+       /*
+        * Thanks to this lock we are sure that any pending cpu election is
+        * done.
+        */
+       spin_lock(&pp->lock);
+       /* Mask all ethernet port interrupts */
+       on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
+       spin_unlock(&pp->lock);
 
-       return NOTIFY_OK;
+       napi_synchronize(&port->napi);
+       napi_disable(&port->napi);
+       /* Disable per-CPU interrupts on the CPU that is brought down. */
+       mvneta_percpu_disable(pp);
+       return 0;
+}
+
+static int mvneta_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+       struct mvneta_port *pp = hlist_entry_safe(node, struct mvneta_port,
+                                                 node_dead);
+
+       /* Check if a new CPU must be elected now this one is down */
+       spin_lock(&pp->lock);
+       mvneta_percpu_elect(pp);
+       spin_unlock(&pp->lock);
+       /* Unmask all ethernet port interrupts */
+       on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
+       mvreg_write(pp, MVNETA_INTR_MISC_MASK,
+                   MVNETA_CAUSE_PHY_STATUS_CHANGE |
+                   MVNETA_CAUSE_LINK_CHANGE |
+                   MVNETA_CAUSE_PSC_SYNC_CHANGE);
+       netif_tx_start_all_queues(pp->dev);
+       return 0;
 }
 
 static int mvneta_open(struct net_device *dev)
@@ -3444,7 +3449,15 @@ static int mvneta_open(struct net_device *dev)
        /* Register a CPU notifier to handle the case where our CPU
         * might be taken offline.
         */
-       register_cpu_notifier(&pp->cpu_notifier);
+       ret = cpuhp_state_add_instance_nocalls(online_hpstate,
+                                              &pp->node_online);
+       if (ret)
+               goto err_free_irq;
+
+       ret = cpuhp_state_add_instance_nocalls(CPUHP_NET_MVNETA_DEAD,
+                                              &pp->node_dead);
+       if (ret)
+               goto err_free_online_hp;
 
        /* In default link is down */
        netif_carrier_off(pp->dev);
@@ -3452,15 +3465,19 @@ static int mvneta_open(struct net_device *dev)
        ret = mvneta_mdio_probe(pp);
        if (ret < 0) {
                netdev_err(dev, "cannot probe MDIO bus\n");
-               goto err_free_irq;
+               goto err_free_dead_hp;
        }
 
        mvneta_start_dev(pp);
 
        return 0;
 
+err_free_dead_hp:
+       cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD,
+                                           &pp->node_dead);
+err_free_online_hp:
+       cpuhp_state_remove_instance_nocalls(online_hpstate, &pp->node_online);
 err_free_irq:
-       unregister_cpu_notifier(&pp->cpu_notifier);
        on_each_cpu(mvneta_percpu_disable, pp, true);
        free_percpu_irq(pp->dev->irq, pp->ports);
 err_cleanup_txqs:
@@ -3486,7 +3503,10 @@ static int mvneta_stop(struct net_device *dev)
 
        mvneta_stop_dev(pp);
        mvneta_mdio_remove(pp);
-       unregister_cpu_notifier(&pp->cpu_notifier);
+
+       cpuhp_state_remove_instance_nocalls(online_hpstate, &pp->node_online);
+       cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD,
+                                           &pp->node_dead);
        on_each_cpu(mvneta_percpu_disable, pp, true);
        free_percpu_irq(dev->irq, pp->ports);
        mvneta_cleanup_rxqs(pp);
@@ -4014,7 +4034,6 @@ static int mvneta_probe(struct platform_device *pdev)
        err = of_property_read_string(dn, "managed", &managed);
        pp->use_inband_status = (err == 0 &&
                                 strcmp(managed, "in-band-status") == 0);
-       pp->cpu_notifier.notifier_call = mvneta_percpu_notifier;
 
        pp->rxq_def = rxq_def;
 
@@ -4217,7 +4236,42 @@ static struct platform_driver mvneta_driver = {
        },
 };
 
-module_platform_driver(mvneta_driver);
+static int __init mvneta_driver_init(void)
+{
+       int ret;
+
+       ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/mvneta:online",
+                                     mvneta_cpu_online,
+                                     mvneta_cpu_down_prepare);
+       if (ret < 0)
+               goto out;
+       online_hpstate = ret;
+       ret = cpuhp_setup_state_multi(CPUHP_NET_MVNETA_DEAD, "net/mvneta:dead",
+                                     NULL, mvneta_cpu_dead);
+       if (ret)
+               goto err_dead;
+
+       ret = platform_driver_register(&mvneta_driver);
+       if (ret)
+               goto err;
+       return 0;
+
+err:
+       cpuhp_remove_multi_state(CPUHP_NET_MVNETA_DEAD);
+err_dead:
+       cpuhp_remove_multi_state(online_hpstate);
+out:
+       return ret;
+}
+module_init(mvneta_driver_init);
+
+static void __exit mvneta_driver_exit(void)
+{
+       platform_driver_unregister(&mvneta_driver);
+       cpuhp_remove_multi_state(CPUHP_NET_MVNETA_DEAD);
+       cpuhp_remove_multi_state(online_hpstate);
+}
+module_exit(mvneta_driver_exit);
 
 MODULE_DESCRIPTION("Marvell NETA Ethernet Driver - www.marvell.com");
 MODULE_AUTHOR("Rami Rosen <rosenr@marvell.com>, Thomas Petazzoni <thomas.petazzoni@free-electrons.com>");
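
mvneta ends up with two hotplug states: a fixed CPUHP_NET_MVNETA_DEAD entry and a dynamically allocated online state, whose number cpuhp_setup_state_multi() returns and which the driver has to stash (online_hpstate above) for later add/remove calls. A minimal sketch of storing and reusing such a dynamic multi state; the my_* names are hypothetical:

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/list.h>

static enum cpuhp_state my_online_state;        /* filled in at init time */

static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        return 0;       /* per-instance bring-up work */
}

static int my_cpu_down_prepare(unsigned int cpu, struct hlist_node *node)
{
        return 0;       /* per-instance quiesce work */
}

static int __init my_driver_init(void)
{
        int ret;

        ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/mydrv:online",
                                      my_cpu_online, my_cpu_down_prepare);
        if (ret < 0)
                return ret;
        /* For dynamic states the allocated state number is returned. */
        my_online_state = ret;
        return 0;
}

static void __exit my_driver_exit(void)
{
        /* The stored number is what later instance and removal calls use. */
        cpuhp_remove_multi_state(my_online_state);
}
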
index 1b5f531eeb2549483e8a189bcde259a902dd3ca8..fad84f3f41099dce6fb62399b0df9ac6be857318 100644 (file)
@@ -138,8 +138,9 @@ struct virtnet_info {
        /* Does the affinity hint is set for virtqueues? */
        bool affinity_hint_set;
 
-       /* CPU hot plug notifier */
-       struct notifier_block nb;
+       /* CPU hotplug instances for online & dead */
+       struct hlist_node node;
+       struct hlist_node node_dead;
 
        /* Control VQ buffers: protected by the rtnl lock */
        struct virtio_net_ctrl_hdr ctrl_hdr;
@@ -1237,25 +1238,53 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
        vi->affinity_hint_set = true;
 }
 
-static int virtnet_cpu_callback(struct notifier_block *nfb,
-                               unsigned long action, void *hcpu)
+static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
-       struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb);
+       struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
+                                                  node);
+       virtnet_set_affinity(vi);
+       return 0;
+}
 
-       switch(action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-       case CPU_DEAD:
-               virtnet_set_affinity(vi);
-               break;
-       case CPU_DOWN_PREPARE:
-               virtnet_clean_affinity(vi, (long)hcpu);
-               break;
-       default:
-               break;
-       }
+static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+       struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
+                                                  node_dead);
+       virtnet_set_affinity(vi);
+       return 0;
+}
 
-       return NOTIFY_OK;
+static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
+{
+       struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
+                                                  node);
+
+       virtnet_clean_affinity(vi, cpu);
+       return 0;
+}
+
+static enum cpuhp_state virtionet_online;
+
+static int virtnet_cpu_notif_add(struct virtnet_info *vi)
+{
+       int ret;
+
+       ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
+       if (ret)
+               return ret;
+       ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
+                                              &vi->node_dead);
+       if (!ret)
+               return ret;
+       cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
+       return ret;
+}
+
+static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
+{
+       cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
+       cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
+                                           &vi->node_dead);
 }
 
 static void virtnet_get_ringparam(struct net_device *dev,
@@ -1879,8 +1908,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 
        virtio_device_ready(vdev);
 
-       vi->nb.notifier_call = &virtnet_cpu_callback;
-       err = register_hotcpu_notifier(&vi->nb);
+       err = virtnet_cpu_notif_add(vi);
        if (err) {
                pr_debug("virtio_net: registering cpu notifier failed\n");
                goto free_unregister_netdev;
@@ -1934,7 +1962,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 {
        struct virtnet_info *vi = vdev->priv;
 
-       unregister_hotcpu_notifier(&vi->nb);
+       virtnet_cpu_notif_remove(vi);
 
        /* Make sure no work handler is accessing the device. */
        flush_work(&vi->config_work);
@@ -1953,7 +1981,7 @@ static int virtnet_freeze(struct virtio_device *vdev)
        struct virtnet_info *vi = vdev->priv;
        int i;
 
-       unregister_hotcpu_notifier(&vi->nb);
+       virtnet_cpu_notif_remove(vi);
 
        /* Make sure no work handler is accessing the device */
        flush_work(&vi->config_work);
@@ -1997,7 +2025,7 @@ static int virtnet_restore(struct virtio_device *vdev)
        virtnet_set_queues(vi, vi->curr_queue_pairs);
        rtnl_unlock();
 
-       err = register_hotcpu_notifier(&vi->nb);
+       err = virtnet_cpu_notif_add(vi);
        if (err)
                return err;
 
@@ -2039,7 +2067,41 @@ static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
-module_virtio_driver(virtio_net_driver);
+static __init int virtio_net_driver_init(void)
+{
+       int ret;
+
+       ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "AP_VIRT_NET_ONLINE",
+                                     virtnet_cpu_online,
+                                     virtnet_cpu_down_prep);
+       if (ret < 0)
+               goto out;
+       virtionet_online = ret;
+       ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "VIRT_NET_DEAD",
+                                     NULL, virtnet_cpu_dead);
+       if (ret)
+               goto err_dead;
+
+       ret = register_virtio_driver(&virtio_net_driver);
+       if (ret)
+               goto err_virtio;
+       return 0;
+err_virtio:
+       cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
+err_dead:
+       cpuhp_remove_multi_state(virtionet_online);
+out:
+       return ret;
+}
+module_init(virtio_net_driver_init);
+
+static __exit void virtio_net_driver_exit(void)
+{
+       cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
+       cpuhp_remove_multi_state(virtionet_online);
+       unregister_virtio_driver(&virtio_net_driver);
+}
+module_exit(virtio_net_driver_exit);
 
 MODULE_DEVICE_TABLE(virtio, id_table);
 MODULE_DESCRIPTION("Virtio network driver");
index c878aa71173b9122165aad37e52d5e88af419296..55f453de562ee63b8e53ab90cf6b671ef676c2a8 100644 (file)
@@ -60,8 +60,13 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = {
 
 #define ICPU(model)    { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
 
+/*
+ * This table should be in sync with the one in
+ * arch/x86/platform/intel-mid/pwr.c.
+ */
 static const struct x86_cpu_id lpss_cpu_ids[] = {
-       ICPU(INTEL_FAM6_ATOM_MERRIFIELD1),
+       ICPU(INTEL_FAM6_ATOM_PENWELL),
+       ICPU(INTEL_FAM6_ATOM_MERRIFIELD),
        {}
 };
 
index c74059e10a6de9d1afe89cea272c4146bc183e68..f30ca75b5b6c7e20055e881776c44eaf50acbe0a 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/ioport.h>
 #include <linux/cache.h>
 #include <linux/slab.h>
+#include <linux/acpi.h>
 #include "pci.h"
 
 unsigned int pci_flags;
@@ -1852,8 +1853,13 @@ void __init pci_assign_unassigned_resources(void)
 {
        struct pci_bus *root_bus;
 
-       list_for_each_entry(root_bus, &pci_root_buses, node)
+       list_for_each_entry(root_bus, &pci_root_buses, node) {
                pci_assign_unassigned_root_bus_resources(root_bus);
+
+               /* Make sure the root bridge has a companion ACPI device: */
+               if (ACPI_HANDLE(root_bus->bridge))
+                       acpi_ioapic_add(ACPI_HANDLE(root_bus->bridge));
+       }
 }
 
 void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge)
index 56ad3b03af8a5caf5cd6a7f2f8b55d606ed79581..278122e0eaffaea6d7e10fb341d0fac8c778c782 100644 (file)
@@ -709,28 +709,20 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
        return 0;
 }
 
-static DEFINE_SPINLOCK(arm_pmu_lock);
-static LIST_HEAD(arm_pmu_list);
-
 /*
  * PMU hardware loses all context when a CPU goes offline.
  * When a CPU is hotplugged back in, since some hardware registers are
  * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
  * junk values out of them.
  */
-static int arm_perf_starting_cpu(unsigned int cpu)
+static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node)
 {
-       struct arm_pmu *pmu;
-
-       spin_lock(&arm_pmu_lock);
-       list_for_each_entry(pmu, &arm_pmu_list, entry) {
+       struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node);
 
-               if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
-                       continue;
-               if (pmu->reset)
-                       pmu->reset(pmu);
-       }
-       spin_unlock(&arm_pmu_lock);
+       if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
+               return 0;
+       if (pmu->reset)
+               pmu->reset(pmu);
        return 0;
 }
 
@@ -842,9 +834,10 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
        if (!cpu_hw_events)
                return -ENOMEM;
 
-       spin_lock(&arm_pmu_lock);
-       list_add_tail(&cpu_pmu->entry, &arm_pmu_list);
-       spin_unlock(&arm_pmu_lock);
+       err = cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING,
+                                              &cpu_pmu->node);
+       if (err)
+               goto out_free;
 
        err = cpu_pm_pmu_register(cpu_pmu);
        if (err)
@@ -880,9 +873,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
        return 0;
 
 out_unregister:
-       spin_lock(&arm_pmu_lock);
-       list_del(&cpu_pmu->entry);
-       spin_unlock(&arm_pmu_lock);
+       cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING,
+                                           &cpu_pmu->node);
+out_free:
        free_percpu(cpu_hw_events);
        return err;
 }
@@ -890,9 +883,8 @@ out_unregister:
 static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
 {
        cpu_pm_pmu_unregister(cpu_pmu);
-       spin_lock(&arm_pmu_lock);
-       list_del(&cpu_pmu->entry);
-       spin_unlock(&arm_pmu_lock);
+       cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING,
+                                           &cpu_pmu->node);
        free_percpu(cpu_pmu->hw_events);
 }
 
@@ -1090,9 +1082,9 @@ static int arm_pmu_hp_init(void)
 {
        int ret;
 
-       ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_STARTING,
-                                       "AP_PERF_ARM_STARTING",
-                                       arm_perf_starting_cpu, NULL);
+       ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING,
+                                     "AP_PERF_ARM_STARTING",
+                                     arm_perf_starting_cpu, NULL);
        if (ret)
                pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n",
                       ret);
index fbab29dfa7937cb615d3259634655a48052c58e5..243b233ff31bff9fdcfae1ab019b3104fd36916e 100644 (file)
@@ -1154,8 +1154,8 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 
        RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1,   rapl_defaults_byt),
        RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT,       rapl_defaults_cht),
-       RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD1,   rapl_defaults_tng),
-       RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD2,   rapl_defaults_ann),
+       RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD,    rapl_defaults_tng),
+       RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD,    rapl_defaults_ann),
        RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT,      rapl_defaults_core),
        RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON,     rapl_defaults_core),
 
index ac0df4dde823866b54480c7062d5188887009365..3b792ab3c0dc17d47b42643ec9a9807f663a229c 100644 (file)
@@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
                save_stack_trace_tsk(task, &trace);
 
                for (i = 0; i < trace.nr_entries; i++) {
-                       seq_printf(m, "[<%pK>] %pS\n",
+                       seq_printf(m, "[<%pK>] %pB\n",
                                   (void *)entries[i], (void *)entries[i]);
                }
                unlock_trace(task);
index e7465528b8dcfe52de57663cfb5aedb3b8ed229c..6d0816ba84d880af4013c25c46db35f2a0dce1d6 100644 (file)
@@ -758,6 +758,12 @@ static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb)
 
 #endif /* !CONFIG_ACPI */
 
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+int acpi_ioapic_add(acpi_handle root);
+#else
+static inline int acpi_ioapic_add(acpi_handle root) { return 0; }
+#endif
+
 #ifdef CONFIG_ACPI
 void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
                               u32 pm1a_ctrl,  u32 pm1b_ctrl));
index 598bc999f4c2860969771b77d67872348f1f87f1..3b77588a93602eecd5d73d30d6fd77377286f07e 100644 (file)
@@ -339,6 +339,24 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen,
        return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
 }
 
+/*
+ * bitmap_from_u64 - copy a u64 into a bitmap, fixing up word order
+ *  @dst:  destination bitmap
+ *  @mask: source 64-bit value
+ *
+ * In a 32-bit big-endian kernel, reading a u64 via (u32 *)(&val)[*]
+ * returns the wrong word: "(u32 *)(&val)[0]" yields the upper 32 bits,
+ * whereas the lower 32 bits of the u64 are expected.
+ */
+static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
+{
+       dst[0] = mask & ULONG_MAX;
+
+       if (sizeof(mask) > sizeof(unsigned long))
+               dst[1] = mask >> 32;
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __LINUX_BITMAP_H */
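
A minimal usage sketch of the new helper (the function and variable names are illustrative, not part of this patch): a 64-bit mask is copied into a bitmap with the correct word order so the regular bitmap iterators work on both 32-bit and 64-bit kernels.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/printk.h>

static void walk_u64_mask(u64 mask)
{
	DECLARE_BITMAP(bits, 64);	/* room for a full u64 on any word size */
	unsigned int bit;

	bitmap_from_u64(bits, mask);	/* fixes up word order on 32-bit BE */

	for_each_set_bit(bit, bits, 64)
		pr_info("bit %u is set\n", bit);
}
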
index 0e0faff63bd2d7e98c5a0eead16f2aa445d9afd3..cf0fa5d86059b6672773025b625027f6301074c1 100644 (file)
@@ -554,13 +554,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * object's lifetime is managed by something other than RCU.  That
  * "something other" might be reference counting or simple immortality.
  *
- * The seemingly unused size_t variable is to validate @p is indeed a pointer
- * type by making sure it can be dereferenced.
+ * The seemingly unused variable ___typecheck_p validates that @p is
+ * indeed a pointer type by using a pointer to typeof(*p) as the type.
+ * Taking a pointer to typeof(*p) again is needed in case p is void *.
  */
 #define lockless_dereference(p) \
 ({ \
        typeof(p) _________p1 = READ_ONCE(p); \
-       size_t __maybe_unused __size_of_ptr = sizeof(*(p)); \
+       typeof(*(p)) *___typecheck_p __maybe_unused; \
        smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
        (_________p1); \
 })
index ad4f1f33a74e93050cb110fdf6f8f512c0c11499..7572d9e9dced921e1732226a2c00f5802f58c735 100644 (file)
@@ -61,17 +61,8 @@ struct notifier_block;
 #define CPU_DOWN_PREPARE       0x0005 /* CPU (unsigned)v going down */
 #define CPU_DOWN_FAILED                0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD               0x0007 /* CPU (unsigned)v dead */
-#define CPU_DYING              0x0008 /* CPU (unsigned)v not running any task,
-                                       * not handling interrupts, soon dead.
-                                       * Called on the dying cpu, interrupts
-                                       * are already disabled. Must not
-                                       * sleep, must not fail */
 #define CPU_POST_DEAD          0x0009 /* CPU (unsigned)v dead, cpu_hotplug
                                        * lock is dropped */
-#define CPU_STARTING           0x000A /* CPU (unsigned)v soon running.
-                                       * Called on the new cpu, just before
-                                       * enabling interrupts. Must not sleep,
-                                       * must not fail */
 #define CPU_BROKEN             0x000B /* CPU (unsigned)v did not die properly,
                                        * perhaps due to preemption. */
 
@@ -86,9 +77,6 @@ struct notifier_block;
 #define CPU_DOWN_PREPARE_FROZEN        (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
 #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN                (CPU_DEAD | CPU_TASKS_FROZEN)
-#define CPU_DYING_FROZEN       (CPU_DYING | CPU_TASKS_FROZEN)
-#define CPU_STARTING_FROZEN    (CPU_STARTING | CPU_TASKS_FROZEN)
-
 
 #ifdef CONFIG_SMP
 extern bool cpuhp_tasks_frozen;
index 9177041fed2c047a118333553d63e9289f545189..211c1287e7d6f157f6956f89da5b285003179ed5 100644 (file)
@@ -14,13 +14,25 @@ enum cpuhp_state {
        CPUHP_PERF_SUPERH,
        CPUHP_X86_HPET_DEAD,
        CPUHP_X86_APB_DEAD,
+       CPUHP_VIRT_NET_DEAD,
+       CPUHP_SLUB_DEAD,
+       CPUHP_MM_WRITEBACK_DEAD,
+       CPUHP_SOFTIRQ_DEAD,
+       CPUHP_NET_MVNETA_DEAD,
+       CPUHP_CPUIDLE_DEAD,
        CPUHP_WORKQUEUE_PREP,
        CPUHP_POWER_NUMA_PREPARE,
        CPUHP_HRTIMERS_PREPARE,
        CPUHP_PROFILE_PREPARE,
        CPUHP_X2APIC_PREPARE,
        CPUHP_SMPCFD_PREPARE,
+       CPUHP_RELAY_PREPARE,
+       CPUHP_SLAB_PREPARE,
+       CPUHP_MD_RAID5_PREPARE,
        CPUHP_RCUTREE_PREP,
+       CPUHP_CPUIDLE_COUPLED_PREPARE,
+       CPUHP_POWERPC_PMAC_PREPARE,
+       CPUHP_POWERPC_MMU_CTX_PREPARE,
        CPUHP_NOTIFY_PREPARE,
        CPUHP_TIMERS_DEAD,
        CPUHP_BRINGUP_CPU,
@@ -70,7 +82,6 @@ enum cpuhp_state {
        CPUHP_AP_ARM64_ISNDEP_STARTING,
        CPUHP_AP_SMPCFD_DYING,
        CPUHP_AP_X86_TBOOT_DYING,
-       CPUHP_AP_NOTIFY_STARTING,
        CPUHP_AP_ONLINE,
        CPUHP_TEARDOWN_CPU,
        CPUHP_AP_ONLINE_IDLE,
@@ -102,7 +113,7 @@ enum cpuhp_state {
 
 int __cpuhp_setup_state(enum cpuhp_state state,        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
-                       int (*teardown)(unsigned int cpu));
+                       int (*teardown)(unsigned int cpu), bool multi_instance);
 
 /**
  * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
@@ -119,7 +130,7 @@ static inline int cpuhp_setup_state(enum cpuhp_state state,
                                    int (*startup)(unsigned int cpu),
                                    int (*teardown)(unsigned int cpu))
 {
-       return __cpuhp_setup_state(state, name, true, startup, teardown);
+       return __cpuhp_setup_state(state, name, true, startup, teardown, false);
 }
 
 /**
@@ -138,7 +149,66 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
                                            int (*startup)(unsigned int cpu),
                                            int (*teardown)(unsigned int cpu))
 {
-       return __cpuhp_setup_state(state, name, false, startup, teardown);
+       return __cpuhp_setup_state(state, name, false, startup, teardown,
+                                  false);
+}
+
+/**
+ * cpuhp_setup_state_multi - Add callbacks for multi state
+ * @state:     The state for which the calls are installed
+ * @name:      Name of the callback.
+ * @startup:   startup callback function
+ * @teardown:  teardown callback function
+ *
+ * Sets the internal multi_instance flag and prepares a state to work as a multi
+ * instance callback. No callbacks are invoked at this point. The callbacks are
+ * invoked once an instance for this state is registered via
+ * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls.
+ */
+static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
+                                         const char *name,
+                                         int (*startup)(unsigned int cpu,
+                                                        struct hlist_node *node),
+                                         int (*teardown)(unsigned int cpu,
+                                                         struct hlist_node *node))
+{
+       return __cpuhp_setup_state(state, name, false,
+                                  (void *) startup,
+                                  (void *) teardown, true);
+}
+
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+                              bool invoke);
+
+/**
+ * cpuhp_state_add_instance - Add an instance for a state and invoke startup
+ *                            callback.
+ * @state:     The state for which the instance is installed
+ * @node:      The node for this individual state.
+ *
+ * Installs the instance for the @state and invokes the startup callback on
+ * the present cpus which have already reached the @state. The @state must have
+ * been earlier marked as multi-instance by @cpuhp_setup_state_multi.
+ */
+static inline int cpuhp_state_add_instance(enum cpuhp_state state,
+                                          struct hlist_node *node)
+{
+       return __cpuhp_state_add_instance(state, node, true);
+}
+
+/**
+ * cpuhp_state_add_instance_nocalls - Add an instance for a state without
+ *                                    invoking the startup callback.
+ * @state:     The state for which the instance is installed
+ * @node:      The node for this individual state.
+ *
+ * Installs the instance for the @state. The @state must have been earlier
+ * marked as multi-instance by @cpuhp_setup_state_multi.
+ */
+static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
+                                                  struct hlist_node *node)
+{
+       return __cpuhp_state_add_instance(state, node, false);
 }
 
 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
@@ -165,6 +235,51 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
        __cpuhp_remove_state(state, false);
 }
 
+/**
+ * cpuhp_remove_multi_state - Remove hotplug multi state callback
+ * @state:     The state for which the calls are removed
+ *
+ * Removes the callback functions from a multi state. This is the reverse of
+ * cpuhp_setup_state_multi(). All instances should have been removed before
+ * invoking this function.
+ */
+static inline void cpuhp_remove_multi_state(enum cpuhp_state state)
+{
+       __cpuhp_remove_state(state, false);
+}
+
+int __cpuhp_state_remove_instance(enum cpuhp_state state,
+                                 struct hlist_node *node, bool invoke);
+
+/**
+ * cpuhp_state_remove_instance - Remove hotplug instance from state and invoke
+ *                               the teardown callback
+ * @state:     The state from which the instance is removed
+ * @node:      The node for this individual state.
+ *
+ * Removes the instance and invokes the teardown callback on the present cpus
+ * which have already reached the @state.
+ */
+static inline int cpuhp_state_remove_instance(enum cpuhp_state state,
+                                             struct hlist_node *node)
+{
+       return __cpuhp_state_remove_instance(state, node, true);
+}
+
+/**
+ * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state
+ *                                      without invoking the teardown callback
+ * @state:     The state from which the instance is removed
+ * @node:      The node for this individual state.
+ *
+ * Removes the instance without invoking the teardown callback.
+ */
+static inline int cpuhp_state_remove_instance_nocalls(enum cpuhp_state state,
+                                                     struct hlist_node *node)
+{
+       return __cpuhp_state_remove_instance(state, node, false);
+}
+
 #ifdef CONFIG_SMP
 void cpuhp_online_idle(enum cpuhp_state state);
 #else
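
For reference, a condensed sketch of the multi-instance flow these helpers enable, mirroring the virtio_net conversion above (all my_* names are hypothetical): one state is set up per driver, and each device instance hooks its hlist_node into that state.

#include <linux/cpuhotplug.h>
#include <linux/kernel.h>
#include <linux/list.h>

struct my_dev {
	struct hlist_node node;		/* linked into the state's instance list */
};

static enum cpuhp_state my_online;	/* dynamically allocated state */

static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *d = hlist_entry_safe(node, struct my_dev, node);

	pr_debug("cpu %u online for dev %p\n", cpu, d);
	return 0;
}

static int __init my_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "my_drv:online",
				      my_cpu_online, NULL);
	if (ret < 0)
		return ret;
	my_online = ret;
	return 0;
}

static int my_dev_register(struct my_dev *d)
{
	/* runs my_cpu_online() on every CPU that already reached the state */
	return cpuhp_state_add_instance(my_online, &d->node);
}

static void my_dev_unregister(struct my_dev *d)
{
	cpuhp_state_remove_instance_nocalls(my_online, &d->node);
}
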
index 7f5a5822538539848450f7d55c94ceee50d6c0bc..0148a3046b4864733200d76a3a45ff26e38c1523 100644 (file)
@@ -118,6 +118,15 @@ typedef struct {
        u32 imagesize;
 } efi_capsule_header_t;
 
+struct efi_boot_memmap {
+       efi_memory_desc_t       **map;
+       unsigned long           *map_size;
+       unsigned long           *desc_size;
+       u32                     *desc_ver;
+       unsigned long           *key_ptr;
+       unsigned long           *buff_size;
+};
+
 /*
  * EFI capsule flags
  */
@@ -946,7 +955,7 @@ extern int efi_memattr_apply_permissions(struct mm_struct *mm,
 /* Iterate through an efi_memory_map */
 #define for_each_efi_memory_desc_in_map(m, md)                            \
        for ((md) = (m)->map;                                              \
-            ((void *)(md) + (m)->desc_size) <= (m)->map_end;              \
+            (md) && ((void *)(md) + (m)->desc_size) <= (m)->map_end;      \
             (md) = (void *)(md) + (m)->desc_size)
 
 /**
@@ -1371,11 +1380,7 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg,
                          efi_loaded_image_t *image, int *cmd_line_len);
 
 efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg,
-                               efi_memory_desc_t **map,
-                               unsigned long *map_size,
-                               unsigned long *desc_size,
-                               u32 *desc_ver,
-                               unsigned long *key_ptr);
+                               struct efi_boot_memmap *map);
 
 efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg,
                           unsigned long size, unsigned long align,
@@ -1457,4 +1462,14 @@ extern void efi_call_virt_check_flags(unsigned long flags, const char *call);
        arch_efi_call_virt_teardown();                                  \
 })
 
+typedef efi_status_t (*efi_exit_boot_map_processing)(
+       efi_system_table_t *sys_table_arg,
+       struct efi_boot_memmap *map,
+       void *priv);
+
+efi_status_t efi_exit_boot_services(efi_system_table_t *sys_table,
+                                   void *handle,
+                                   struct efi_boot_memmap *map,
+                                   void *priv,
+                                   efi_exit_boot_map_processing priv_func);
 #endif /* _LINUX_EFI_H */
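
A sketch of how a stub-side caller is expected to fill in the new struct before calling efi_get_memory_map() (the local variable and function names are illustrative):

static efi_status_t fetch_memory_map(efi_system_table_t *sys_table)
{
	efi_memory_desc_t *memory_map;
	unsigned long map_size, desc_size, buff_size, key;
	u32 desc_ver;
	struct efi_boot_memmap map = {
		.map		= &memory_map,
		.map_size	= &map_size,
		.desc_size	= &desc_size,
		.desc_ver	= &desc_ver,
		.key_ptr	= &key,
		.buff_size	= &buff_size,
	};

	/* On success the helper allocates *map and fills in sizes and key. */
	return efi_get_memory_map(sys_table, &map);
}
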
index 7d565afe35d2fa9ba25359c9bf879d8bde9893e7..6f93ac46e7f0a52f6091755ebf879f14dc80c525 100644 (file)
@@ -795,7 +795,12 @@ struct ftrace_ret_stack {
        unsigned long func;
        unsigned long long calltime;
        unsigned long long subtime;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        unsigned long fp;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+       unsigned long *retp;
+#endif
 };
 
 /*
@@ -807,7 +812,10 @@ extern void return_to_handler(void);
 
 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
-                        unsigned long frame_pointer);
+                        unsigned long frame_pointer, unsigned long *retp);
+
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp);
 
 /*
  * Sometimes we don't want to trace a function with the function
@@ -870,6 +878,13 @@ static inline int task_curr_ret_stack(struct task_struct *tsk)
        return -1;
 }
 
+static inline unsigned long
+ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret,
+                     unsigned long *retp)
+{
+       return ret;
+}
+
 static inline void pause_graph_tracing(void) { }
 static inline void unpause_graph_tracing(void) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
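
The new hook is aimed at stack unwinders: each address pulled off the stack is passed through ftrace_graph_ret_addr() so that return_to_handler trampolines map back to the real return address. A hedged sketch of such a call site (the wrapper name and the graph_idx bookkeeping are illustrative):

static unsigned long unwind_ftrace_fixup(struct task_struct *task,
					 int *graph_idx,
					 unsigned long addr,
					 unsigned long *stack_loc)
{
	/*
	 * With HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, stack_loc (where the
	 * address was read from) disambiguates nested trampolines;
	 * otherwise *graph_idx tracks the position in the ret_stack.
	 * Non-trampoline addresses are passed through unchanged.
	 */
	return ftrace_graph_ret_addr(task, graph_idx, addr, stack_loc);
}
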
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
new file mode 100644 (file)
index 0000000..3fa5ef2
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef __LINUX_HYPERVISOR_H
+#define __LINUX_HYPERVISOR_H
+
+/*
+ *     Generic Hypervisor support
+ *             Juergen Gross <jgross@suse.com>
+ */
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+#include <asm/hypervisor.h>
+#else
+static inline void hypervisor_pin_vcpu(int cpu)
+{
+}
+#endif
+
+#endif /* __LINUX_HYPERVISOR_H */
index b52424eaa0ed38e932d174f6a950a488d823399b..603986741f2ccac2c5b955e082429e8f0aab7dc4 100644 (file)
@@ -916,12 +916,20 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
                             unsigned int clr, unsigned int set);
 
 struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq);
-int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
-                                  int num_ct, const char *name,
-                                  irq_flow_handler_t handler,
-                                  unsigned int clr, unsigned int set,
-                                  enum irq_gc_flags flags);
 
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+                                    int num_ct, const char *name,
+                                    irq_flow_handler_t handler,
+                                    unsigned int clr, unsigned int set,
+                                    enum irq_gc_flags flags);
+
+#define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \
+                                      handler, clr, set, flags)        \
+({                                                                     \
+       MAYBE_BUILD_BUG_ON(irqs_per_chip > 32);                         \
+       __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\
+                                        handler, clr, set, flags);     \
+})
 
 static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d)
 {
index a534c7f15a615520d802a78a82fe55d2af8e4cae..a0547c571800e7ca77f9a7ffed9412a16bee9262 100644 (file)
@@ -269,9 +269,15 @@ struct static_key_false {
 #define DEFINE_STATIC_KEY_TRUE(name)   \
        struct static_key_true name = STATIC_KEY_TRUE_INIT
 
+#define DECLARE_STATIC_KEY_TRUE(name)  \
+       extern struct static_key_true name
+
 #define DEFINE_STATIC_KEY_FALSE(name)  \
        struct static_key_false name = STATIC_KEY_FALSE_INIT
 
+#define DECLARE_STATIC_KEY_FALSE(name) \
+       extern struct static_key_false name
+
 #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)              \
        struct static_key_true name[count] = {                  \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,     \
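
The new DECLARE_* variants let a static key be defined in one translation unit while fast paths elsewhere branch on it. A minimal sketch (my_feature_key and do_extra_work() are hypothetical):

/* in a shared header: */
DECLARE_STATIC_KEY_FALSE(my_feature_key);

/* in exactly one .c file: */
DEFINE_STATIC_KEY_FALSE(my_feature_key);

/* fast path elsewhere compiles to a patched jump, no memory load: */
static inline void my_hot_path(void)
{
	if (static_branch_unlikely(&my_feature_key))
		do_extra_work();	/* hypothetical slow-path helper */
}

/* the key is flipped at runtime with static_branch_enable(&my_feature_key) */
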
index 2b6a204bd8d40cfb74db80e19bec216795366e33..3ffc69ebe967effe57608ee9c20dd5458e7ab27b 100644 (file)
@@ -63,6 +63,13 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
 #define ktime_add(lhs, rhs) \
                ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; })
 
+/*
+ * Same as ktime_add(), but avoids undefined behaviour on overflow; however,
+ * this means that you must check the result for overflow yourself.
+ */
+#define ktime_add_unsafe(lhs, rhs) \
+               ({ (ktime_t){ .tv64 = (u64) (lhs).tv64 + (rhs).tv64 }; })
+
 /*
  * Add a ktime_t variable and a scalar nanosecond value.
  * res = kt + nsval:
index c2fa3ecb0dce57dde9ad4a92a35dc4178d7a159e..146efefde2a157008fce62fcdaa376f13e2c7682 100644 (file)
 
 struct percpu_rw_semaphore {
        struct rcu_sync         rss;
-       unsigned int __percpu   *fast_read_ctr;
+       unsigned int __percpu   *read_count;
        struct rw_semaphore     rw_sem;
-       atomic_t                slow_read_ctr;
-       wait_queue_head_t       write_waitq;
+       wait_queue_head_t       writer;
+       int                     readers_block;
 };
 
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+       might_sleep();
+
+       rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+       preempt_disable();
+       /*
+        * We are in an RCU-sched read-side critical section, so the writer
+        * cannot both change sem->state from readers_fast and start checking
+        * counters while we are here. So if we see !sem->state, we know that
+        * the writer won't be checking until we're past the preempt_enable()
+        * and that once the synchronize_sched() is done, the writer will see
+        * anything we did within this RCU-sched read-side critical section.
+        */
+       __this_cpu_inc(*sem->read_count);
+       if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+               __percpu_down_read(sem, false); /* Unconditional memory barrier */
+       preempt_enable();
+       /*
+        * The barrier() from preempt_enable() prevents the compiler from
+        * bleeding the critical section out.
+        */
+}
+
+static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+       int ret = 1;
+
+       preempt_disable();
+       /*
+        * Same as in percpu_down_read().
+        */
+       __this_cpu_inc(*sem->read_count);
+       if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+               ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
+       preempt_enable();
+       /*
+        * The barrier() from preempt_enable() prevents the compiler from
+        * bleeding the critical section out.
+        */
+
+       if (ret)
+               rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+       return ret;
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+       /*
+        * The barrier() in preempt_disable() prevents the compiler from
+        * bleeding the critical section out.
+        */
+       preempt_disable();
+       /*
+        * Same as in percpu_down_read().
+        */
+       if (likely(rcu_sync_is_idle(&sem->rss)))
+               __this_cpu_dec(*sem->read_count);
+       else
+               __percpu_up_read(sem); /* Unconditional memory barrier */
+       preempt_enable();
+
+       rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);
+
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(brw) \
+#define percpu_init_rwsem(sem)                                 \
 ({                                                             \
        static struct lock_class_key rwsem_key;                 \
-       __percpu_init_rwsem(brw, #brw, &rwsem_key);             \
+       __percpu_init_rwsem(sem, #sem, &rwsem_key);             \
 })
 
-
 #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
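
For context, a usage sketch of the reworked primitive (the list and all my_* names are illustrative): readers now only bump a per-cpu counter on the fast path, while percpu_down_write() forces them into the slow path for the duration of the write.

#include <linux/percpu-rwsem.h>
#include <linux/list.h>

static struct percpu_rw_semaphore my_sem;
static LIST_HEAD(my_list);

static int __init my_setup(void)
{
	return percpu_init_rwsem(&my_sem);
}

static int reader_count(void)
{
	struct list_head *pos;
	int n = 0;

	percpu_down_read(&my_sem);	/* fast path: just a per-cpu increment */
	list_for_each(pos, &my_list)
		n++;
	percpu_up_read(&my_sem);
	return n;
}

static void writer_add(struct list_head *new)
{
	percpu_down_write(&my_sem);	/* waits for all active readers to drain */
	list_add(new, &my_list);
	percpu_up_write(&my_sem);
}
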
index dc1f2f30c961e3b0174527d75867df67c8f5878a..05d46ddb6f568ad2b3d8a8097f0d02f42d7f40b3 100644 (file)
@@ -116,7 +116,7 @@ struct arm_pmu {
        DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
        struct platform_device  *plat_device;
        struct pmu_hw_events    __percpu *hw_events;
-       struct list_head        entry;
+       struct hlist_node       node;
        struct notifier_block   cpu_pm_nb;
        /* the attr_groups array must be NULL-terminated */
        const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1];
index ccb73a58113dd845f67dae43dbfcf2992ea6f7b3..060d0ede88df6dfc34fbfcd1e60629d8dce5373d 100644 (file)
@@ -510,9 +510,15 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);
 
-enum perf_group_flag {
-       PERF_GROUP_SOFTWARE             = 0x1,
-};
+/*
+ * Event capabilities. For event_caps and group_caps.
+ *
+ * PERF_EV_CAP_SOFTWARE: Is a software event.
+ * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
+ * from any CPU in the package where it is active.
+ */
+#define PERF_EV_CAP_SOFTWARE           BIT(0)
+#define PERF_EV_CAP_READ_ACTIVE_PKG    BIT(1)
 
 #define SWEVENT_HLIST_BITS             8
 #define SWEVENT_HLIST_SIZE             (1 << SWEVENT_HLIST_BITS)
@@ -568,7 +574,12 @@ struct perf_event {
        struct hlist_node               hlist_entry;
        struct list_head                active_entry;
        int                             nr_siblings;
-       int                             group_flags;
+
+       /* Not serialized. Only written during event initialization. */
+       int                             event_caps;
+       /* The cumulative AND of all event_caps for events in this group. */
+       int                             group_caps;
+
        struct perf_event               *group_leader;
        struct pmu                      *pmu;
        void                            *pmu_private;
@@ -778,6 +789,9 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup              *cgrp;
 #endif
+
+       struct list_head                sched_cb_entry;
+       int                             sched_cb_usage;
 };
 
 struct perf_output_handle {
@@ -994,7 +1008,7 @@ static inline bool is_sampling_event(struct perf_event *event)
  */
 static inline int is_software_event(struct perf_event *event)
 {
-       return event->pmu->task_ctx_nr == perf_sw_context;
+       return event->event_caps & PERF_EV_CAP_SOFTWARE;
 }
 
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
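
A hedged sketch of how a PMU driver would advertise the package-wide read capability at event-init time (the function name is illustrative):

static int my_uncore_event_init(struct perf_event *event)
{
	/* ... validate event->attr against this PMU ... */

	/*
	 * Counters of this PMU can be read from any CPU in the package,
	 * so the core may skip the cross-CPU IPI when reading the count.
	 */
	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
	return 0;
}
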
index 1d405a2b72729e77c068bc2ce44fce737ed682bd..e4c08c1ff0c5906786f549c36fbc9b060f323e58 100644 (file)
@@ -4,11 +4,6 @@
 #include <linux/mm_types.h>
 #include <asm/mmu_context.h>
 
-#define PKEY_DISABLE_ACCESS    0x1
-#define PKEY_DISABLE_WRITE     0x2
-#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
-                                PKEY_DISABLE_WRITE)
-
 #ifdef CONFIG_ARCH_HAS_PKEYS
 #include <asm/pkeys.h>
 #else /* ! CONFIG_ARCH_HAS_PKEYS */
 #define execute_only_pkey(mm) (0)
 #define arch_override_mprotect_pkey(vma, prot, pkey) (0)
 #define PKEY_DEDICATED_EXECUTE_ONLY 0
-#endif /* ! CONFIG_ARCH_HAS_PKEYS */
+#define ARCH_VM_PKEY_FLAGS 0
+
+static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+       return (pkey == 0);
+}
+
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+       return -1;
+}
 
-/*
- * This is called from mprotect_pkey().
- *
- * Returns true if the protection keys is valid.
- */
-static inline bool validate_pkey(int pkey)
+static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
 {
-       if (pkey < 0)
-               return false;
-       return (pkey < arch_max_pkey());
+       WARN_ONCE(1, "free of protection key when disabled");
+       return -EINVAL;
 }
 
+static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+                       unsigned long init_val)
+{
+       return 0;
+}
+
+static inline void copy_init_pkru_to_fpregs(void)
+{
+}
+
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
+
 #endif /* _LINUX_PKEYS_H */
index a63a33e6196e23905406e2dab831b17b5d883597..ece7ed9a4a7054abf6649489d53016117698fac9 100644 (file)
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
index d7c8359693c6e19cdf57c52316f2e223581f14b2..ecbb34a382b898cabf40d04237fbfae5534c5bc0 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/kref.h>
+#include <linux/percpu.h>
 
 /*
  * Tracks changes to rchan/rchan_buf structs
@@ -63,7 +64,7 @@ struct rchan
        struct kref kref;               /* channel refcount */
        void *private_data;             /* for user-defined data */
        size_t last_toobig;             /* tried to log event > subbuf size */
-       struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
+       struct rchan_buf ** __percpu buf; /* per-cpu channel buffers */
        int is_global;                  /* One global buffer ? */
        struct list_head list;          /* for channel list */
        struct dentry *parent;          /* parent dentry passed to open */
@@ -204,7 +205,7 @@ static inline void relay_write(struct rchan *chan,
        struct rchan_buf *buf;
 
        local_irq_save(flags);
-       buf = chan->buf[smp_processor_id()];
+       buf = *this_cpu_ptr(chan->buf);
        if (unlikely(buf->offset + length > chan->subbuf_size))
                length = relay_switch_subbuf(buf, length);
        memcpy(buf->data + buf->offset, data, length);
@@ -230,12 +231,12 @@ static inline void __relay_write(struct rchan *chan,
 {
        struct rchan_buf *buf;
 
-       buf = chan->buf[get_cpu()];
+       buf = *get_cpu_ptr(chan->buf);
        if (unlikely(buf->offset + length > buf->chan->subbuf_size))
                length = relay_switch_subbuf(buf, length);
        memcpy(buf->data + buf->offset, data, length);
        buf->offset += length;
-       put_cpu();
+       put_cpu_ptr(chan->buf);
 }
 
 /**
@@ -251,17 +252,19 @@ static inline void __relay_write(struct rchan *chan,
  */
 static inline void *relay_reserve(struct rchan *chan, size_t length)
 {
-       void *reserved;
-       struct rchan_buf *buf = chan->buf[smp_processor_id()];
+       void *reserved = NULL;
+       struct rchan_buf *buf = *get_cpu_ptr(chan->buf);
 
        if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
                length = relay_switch_subbuf(buf, length);
                if (!length)
-                       return NULL;
+                       goto end;
        }
        reserved = buf->data + buf->offset;
        buf->offset += length;
 
+end:
+       put_cpu_ptr(chan->buf);
        return reserved;
 }
 
@@ -285,5 +288,11 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf,
  */
 extern const struct file_operations relay_file_operations;
 
+#ifdef CONFIG_RELAY
+int relay_prepare_cpu(unsigned int cpu);
+#else
+#define relay_prepare_cpu     NULL
+#endif
+
 #endif /* _LINUX_RELAY_H */
 
index b0fa726b7f31a625cb25667e55cb4d919a2271d4..06bd6ab542313770f01251b1af0400086f51c97f 100644 (file)
@@ -1022,7 +1022,8 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
 #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY    0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu capacity */
 #define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
 #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
@@ -1923,6 +1924,9 @@ struct task_struct {
 #ifdef CONFIG_MMU
        struct task_struct *oom_reaper_list;
 #endif
+#ifdef CONFIG_VMAP_STACK
+       struct vm_struct *stack_vm_area;
+#endif
 /* CPU-specific state of this task */
        struct thread_struct thread;
 /*
@@ -1939,6 +1943,18 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return t->stack_vm_area;
+}
+#else
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return NULL;
+}
+#endif
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -3236,6 +3252,15 @@ static inline void cond_resched_rcu(void)
 #endif
 }
 
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       return p->preempt_disable_ip;
+#else
+       return 0;
+#endif
+}
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
index 4293808d8cfb5d4a473f220735e53da87390998b..084b12bad198232426c6beb27cd348d72797b344 100644 (file)
@@ -650,4 +650,12 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 unsigned int kmem_cache_size(struct kmem_cache *s);
 void __init kmem_cache_init_late(void);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
+int slab_prepare_cpu(unsigned int cpu);
+int slab_dead_cpu(unsigned int cpu);
+#else
+#define slab_prepare_cpu       NULL
+#define slab_dead_cpu          NULL
+#endif
+
 #endif /* _LINUX_SLAB_H */
index eccae4690f4153d60ad256f60ff4c969366211a8..8e0cb7a0f836faadd10ee917f8fe2c76d8965fa6 100644 (file)
@@ -196,6 +196,9 @@ extern void arch_enable_nonboot_cpus_end(void);
 
 void smp_setup_processor_id(void);
 
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par,
+                   bool phys);
+
 /* SMP core functions */
 int smpcfd_prepare_cpu(unsigned int cpu);
 int smpcfd_dead_cpu(unsigned int cpu);
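
A short sketch of the new helper (names are illustrative): it runs the callback on the requested CPU from sleepable context and returns the callback's return value; passing phys=true additionally asks the hypervisor, if one is present, to pin the vCPU to the physical CPU for the duration of the call.

#include <linux/smp.h>
#include <linux/topology.h>

static int read_node_id(void *arg)
{
	int *out = arg;

	*out = numa_node_id();	/* executes on the requested CPU */
	return 0;
}

static int query_cpu_node(unsigned int cpu, int *node)
{
	/* may sleep; propagates read_node_id()'s return value */
	return smp_call_on_cpu(cpu, read_node_id, node, false);
}
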
index d02239022bd083d3253595f63c211c5f680b54c3..0d7abb8b7315ce3ab5162bc606c64337f3709d20 100644 (file)
@@ -898,4 +898,12 @@ asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
 
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 
+asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
+                                 unsigned long prot, int pkey);
+asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+asmlinkage long sys_pkey_free(int pkey);
+//asmlinkage long sys_pkey_get(int pkey, unsigned long flags);
+//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights,
+//                          unsigned long flags);
+
 #endif
index 7e5d2fa9ac463ef1a9643d12b9f17ab63454cfbd..980c71b3001a535fa8e304030a6fb8002726cebf 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/math64.h>
 
 typedef __s64 time64_t;
+typedef __u64 timeu64_t;
 
 /*
  * This wants to go into uapi/linux/time.h once we agreed about the
index a72bd93ec7e5b9f47057ba136faec3a59e2ce587..996953db91d7dd9469634b8c9095b303cf7b7955 100644 (file)
@@ -33,6 +33,34 @@ TRACE_EVENT(cpuhp_enter,
                  __entry->cpu, __entry->target, __entry->idx, __entry->fun)
 );
 
+TRACE_EVENT(cpuhp_multi_enter,
+
+       TP_PROTO(unsigned int cpu,
+                int target,
+                int idx,
+                int (*fun)(unsigned int, struct hlist_node *),
+                struct hlist_node *node),
+
+       TP_ARGS(cpu, target, idx, fun, node),
+
+       TP_STRUCT__entry(
+               __field( unsigned int,  cpu             )
+               __field( int,           target          )
+               __field( int,           idx             )
+               __field( void *,        fun             )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cpu;
+               __entry->target = target;
+               __entry->idx    = idx;
+               __entry->fun    = fun;
+       ),
+
+       TP_printk("cpu: %04u target: %3d step: %3d (%pf)",
+                 __entry->cpu, __entry->target, __entry->idx, __entry->fun)
+);
+
 TRACE_EVENT(cpuhp_exit,
 
        TP_PROTO(unsigned int cpu,
index 58274382a6164de8e9d03ec85da0cf07996869d6..8c27db0c5c08ce84fc7299bd9bff2cee571b9397 100644 (file)
@@ -72,4 +72,9 @@
 #define MAP_HUGE_SHIFT 26
 #define MAP_HUGE_MASK  0x3f
 
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+#define PKEY_ACCESS_MASK       (PKEY_DISABLE_ACCESS |\
+                                PKEY_DISABLE_WRITE)
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
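
A hedged userspace sketch of the syscalls these flags feed (assuming the toolchain headers already carry the __NR_pkey_* numbers for the target architecture; no libc wrappers are assumed): allocate a key that denies writes and attach it to an existing mapping.

#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef PKEY_DISABLE_WRITE
#define PKEY_DISABLE_WRITE 0x2	/* value from the uapi header above */
#endif

static int make_write_protected(void *addr, size_t len)
{
	long pkey = syscall(__NR_pkey_alloc, 0, PKEY_DISABLE_WRITE);

	if (pkey < 0)
		return -1;
	/* like mprotect(), but additionally tags the range with the key */
	return syscall(__NR_pkey_mprotect, addr, len,
		       PROT_READ | PROT_WRITE, (int)pkey);
}
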
index a26415b5151ce1866e805129de53331c76724813..dbfee7e86ba6429319188a03d618881ae34a9ef0 100644 (file)
@@ -724,9 +724,19 @@ __SYSCALL(__NR_copy_file_range, sys_copy_file_range)
 __SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
 #define __NR_pwritev2 287
 __SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
+#define __NR_pkey_mprotect 288
+__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect)
+#define __NR_pkey_alloc 289
+__SYSCALL(__NR_pkey_alloc,    sys_pkey_alloc)
+#define __NR_pkey_free 290
+__SYSCALL(__NR_pkey_free,     sys_pkey_free)
+#define __NR_pkey_get 291
+//__SYSCALL(__NR_pkey_get,      sys_pkey_get)
+#define __NR_pkey_set 292
+//__SYSCALL(__NR_pkey_set,      sys_pkey_set)
 
 #undef __NR_syscalls
-#define __NR_syscalls 288
+#define __NR_syscalls 291
 
 /*
  * All syscalls below here should go away really,
index f18490985fc8e5f39d10ed442d302293ac0e7699..a4c4d735d7813d0a23e8795b027511f27c9b85c8 100644 (file)
@@ -3,6 +3,24 @@
  *
  * Scheduler state interactions
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
  */
 
 
 #include <xen/interface/event_channel.h>
 
+/*
+ * Guest Scheduler Operations
+ *
+ * The SCHEDOP interface provides mechanisms for a guest to interact
+ * with the scheduler, including yield, blocking and shutting itself
+ * down.
+ */
+
 /*
  * The prototype for this hypercall is:
- *  long sched_op_new(int cmd, void *arg)
+ * long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...)
+ *
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == Operation-specific extra argument(s), as described below.
+ * ...  == Additional Operation-specific extra arguments, described below.
  *
- * **NOTE**:
- * Versions of Xen prior to 3.0.2 provide only the following legacy version
+ * Versions of Xen prior to 3.0.2 provided only the following legacy version
  * of this hypercall, supporting only the commands yield, block and shutdown:
  *  long sched_op(int cmd, unsigned long arg)
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
  *      == SHUTDOWN_* code (SCHEDOP_shutdown)
+ *
+ * This legacy version is available to new guests as:
+ * long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg)
  */
 
 /*
 /*
  * Halt execution of this domain (all VCPUs) and notify the system controller.
  * @arg == pointer to sched_shutdown structure.
+ *
+ * If the sched_shutdown_t reason is SHUTDOWN_suspend then
+ * x86 PV guests must also set RDX (EDX for 32-bit guests) to the MFN
+ * of the guest's start info page.  RDX/EDX is the third hypercall
+ * argument.
+ *
+ * In addition, when the reason is SHUTDOWN_suspend, this hypercall
+ * returns 1 if suspend was cancelled or the domain was merely
+ * checkpointed, and 0 if it is resuming in a new domain.
  */
 #define SCHEDOP_shutdown    2
-struct sched_shutdown {
-    unsigned int reason; /* SHUTDOWN_* */
-};
-DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
 
 /*
  * Poll a set of event-channel ports. Return when one or more are pending. An
@@ -57,12 +92,6 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
  * @arg == pointer to sched_poll structure.
  */
 #define SCHEDOP_poll        3
-struct sched_poll {
-    GUEST_HANDLE(evtchn_port_t) ports;
-    unsigned int nr_ports;
-    uint64_t timeout;
-};
-DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
 
 /*
  * Declare a shutdown for another domain. The main use of this function is
@@ -71,15 +100,11 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
  * @arg == pointer to sched_remote_shutdown structure.
  */
 #define SCHEDOP_remote_shutdown        4
-struct sched_remote_shutdown {
-    domid_t domain_id;         /* Remote domain ID */
-    unsigned int reason;       /* SHUTDOWN_xxx reason */
-};
 
 /*
  * Latch a shutdown code, so that when the domain later shuts down it
  * reports this code to the control tools.
- * @arg == as for SCHEDOP_shutdown.
+ * @arg == sched_shutdown, as for SCHEDOP_shutdown.
  */
 #define SCHEDOP_shutdown_code 5
 
@@ -92,10 +117,47 @@ struct sched_remote_shutdown {
  * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
  */
 #define SCHEDOP_watchdog    6
+
+/*
+ * Override the current vcpu affinity by pinning it to one physical cpu or
+ * undo this override restoring the previous affinity.
+ * @arg == pointer to sched_pin_override structure.
+ *
+ * A negative pcpu value will undo a previous pin override and restore the
+ * previous cpu affinity.
+ * This call is allowed for the hardware domain only and requires the cpu
+ * to be part of the domain's cpupool.
+ */
+#define SCHEDOP_pin_override 7
+
+struct sched_shutdown {
+    unsigned int reason; /* SHUTDOWN_* => shutdown reason */
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
+
+struct sched_poll {
+    GUEST_HANDLE(evtchn_port_t) ports;
+    unsigned int nr_ports;
+    uint64_t timeout;
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+
+struct sched_remote_shutdown {
+    domid_t domain_id;         /* Remote domain ID */
+    unsigned int reason;       /* SHUTDOWN_* => shutdown reason */
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_remote_shutdown);
+
 struct sched_watchdog {
     uint32_t id;                /* watchdog ID */
     uint32_t timeout;           /* timeout */
 };
+DEFINE_GUEST_HANDLE_STRUCT(sched_watchdog);
+
+struct sched_pin_override {
+    int32_t pcpu;
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_pin_override);
 
 /*
  * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
@@ -107,6 +169,7 @@ struct sched_watchdog {
 #define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
 #define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
 #define SHUTDOWN_watchdog   4  /* Restart because watchdog time expired.     */
+
 /*
  * Domain asked to perform 'soft reset' for it. The expected behavior is to
  * reset internal Xen state for the domain returning it to the point where it
@@ -115,5 +178,6 @@ struct sched_watchdog {
  * interfaces again.
  */
 #define SHUTDOWN_soft_reset 5
+#define SHUTDOWN_MAX        5  /* Maximum valid shutdown reason.             */
 
 #endif /* __XEN_PUBLIC_SCHED_H__ */
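
A sketch of how a Linux guest could issue the new sub-op through the usual hypercall wrapper (HYPERVISOR_sched_op() from the arch Xen headers is assumed; the function name is illustrative). A negative pcpu undoes a previous override.

#include <xen/interface/sched.h>
#include <asm/xen/hypercall.h>

static int xen_pin_self_to(int pcpu)
{
	struct sched_pin_override pin = { .pcpu = pcpu };

	return HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin);
}
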
index d1c51b7f5221fbfbb85ed3d12e4918a970713829..9f51cdf58f5a127002e4fa6d10ac677fc9141675 100644 (file)
@@ -5606,6 +5606,12 @@ int __init cgroup_init(void)
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+       /*
+        * The latency of the synchronize_sched() is too high for cgroups,
+        * avoid it at the cost of forcing all readers into the slow path.
+        */
+       rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
        get_user_ns(init_cgroup_ns.user_ns);
 
        mutex_lock(&cgroup_mutex);
index ebbf027dd4a13accdfd433d0cc50eb33182aa5cf..e7eca02c757f7ede7743cabb00aa64c787e90e7a 100644 (file)
@@ -23,6 +23,8 @@
 #include <linux/tick.h>
 #include <linux/irq.h>
 #include <linux/smpboot.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -37,8 +39,9 @@
  * @thread:    Pointer to the hotplug thread
  * @should_run:        Thread should execute
  * @rollback:  Perform a rollback
- * @cb_stat:   The state for a single callback (install/uninstall)
- * @cb:                Single callback function (install/uninstall)
+ * @single:    Single callback invocation
+ * @bringup:   Single callback bringup or teardown selector
+ * @cb_state:  The state for a single callback (install/uninstall)
  * @result:    Result of the operation
  * @done:      Signal completion to the issuer of the task
  */
@@ -49,8 +52,10 @@ struct cpuhp_cpu_state {
        struct task_struct      *thread;
        bool                    should_run;
        bool                    rollback;
+       bool                    single;
+       bool                    bringup;
+       struct hlist_node       *node;
        enum cpuhp_state        cb_state;
-       int                     (*cb)(unsigned int cpu);
        int                     result;
        struct completion       done;
 #endif
@@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
  * @cant_stop: Bringup/teardown can't be stopped at this step
  */
 struct cpuhp_step {
-       const char      *name;
-       int             (*startup)(unsigned int cpu);
-       int             (*teardown)(unsigned int cpu);
-       bool            skip_onerr;
-       bool            cant_stop;
+       const char              *name;
+       union {
+               int             (*single)(unsigned int cpu);
+               int             (*multi)(unsigned int cpu,
+                                        struct hlist_node *node);
+       } startup;
+       union {
+               int             (*single)(unsigned int cpu);
+               int             (*multi)(unsigned int cpu,
+                                        struct hlist_node *node);
+       } teardown;
+       struct hlist_head       list;
+       bool                    skip_onerr;
+       bool                    cant_stop;
+       bool                    multi_instance;
 };
 
 static DEFINE_MUTEX(cpuhp_state_mutex);
 static struct cpuhp_step cpuhp_bp_states[];
 static struct cpuhp_step cpuhp_ap_states[];
 
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+       /*
+        * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+        * purposes as that state is handled explicitly in cpu_down.
+        */
+       return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+       struct cpuhp_step *sp;
+
+       sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
+       return sp + state;
+}
+
 /**
  * cpuhp_invoke_callback _ Invoke the callbacks for a given state
  * @cpu:       The cpu for which the callback should be invoked
  * @step:      The step in the state machine
- * @cb:                The callback function to invoke
+ * @bringup:   True if the bringup callback should be invoked
  *
- * Called from cpu hotplug and from the state register machinery
+ * Called from cpu hotplug and from the state register machinery.
  */
-static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
-                                int (*cb)(unsigned int))
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+                                bool bringup, struct hlist_node *node)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-       int ret = 0;
-
-       if (cb) {
-               trace_cpuhp_enter(cpu, st->target, step, cb);
+       struct cpuhp_step *step = cpuhp_get_step(state);
+       int (*cbm)(unsigned int cpu, struct hlist_node *node);
+       int (*cb)(unsigned int cpu);
+       int ret, cnt;
+
+       if (!step->multi_instance) {
+               cb = bringup ? step->startup.single : step->teardown.single;
+               if (!cb)
+                       return 0;
+               trace_cpuhp_enter(cpu, st->target, state, cb);
                ret = cb(cpu);
-               trace_cpuhp_exit(cpu, st->state, step, ret);
+               trace_cpuhp_exit(cpu, st->state, state, ret);
+               return ret;
+       }
+       cbm = bringup ? step->startup.multi : step->teardown.multi;
+       if (!cbm)
+               return 0;
+
+       /* Single invocation for instance add/remove */
+       if (node) {
+               trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+               ret = cbm(cpu, node);
+               trace_cpuhp_exit(cpu, st->state, state, ret);
+               return ret;
+       }
+
+       /* State transition. Invoke on all instances */
+       cnt = 0;
+       hlist_for_each(node, &step->list) {
+               trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+               ret = cbm(cpu, node);
+               trace_cpuhp_exit(cpu, st->state, state, ret);
+               if (ret)
+                       goto err;
+               cnt++;
+       }
+       return 0;
+err:
+       /* Rollback the instances if one failed */
+       cbm = !bringup ? step->startup.multi : step->teardown.multi;
+       if (!cbm)
+               return ret;
+
+       hlist_for_each(node, &step->list) {
+               if (!cnt--)
+                       break;
+               cbm(cpu, node);
        }
        return ret;
 }
@@ -260,10 +333,17 @@ void cpu_hotplug_disable(void)
 }
 EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
 
+static void __cpu_hotplug_enable(void)
+{
+       if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+               return;
+       cpu_hotplug_disabled--;
+}
+
 void cpu_hotplug_enable(void)
 {
        cpu_maps_update_begin();
-       WARN_ON(--cpu_hotplug_disabled < 0);
+       __cpu_hotplug_enable();
        cpu_maps_update_done();
 }
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
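The helper above centralizes the balance check: each successful cpu_hotplug_disable() must be paired with exactly one cpu_hotplug_enable(), and a stray extra enable now trips the WARN_ONCE() instead of silently driving cpu_hotplug_disabled negative. A minimal usage sketch (the caller is hypothetical):

	static void acme_reconfigure(void)
	{
		cpu_hotplug_disable();	/* keep the set of online CPUs stable */

		/* ... work that must not race with CPU hotplug ... */

		cpu_hotplug_enable();	/* exactly one enable per disable */
	}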
@@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu)
        return 0;
 }
 
-static int notify_starting(unsigned int cpu)
-{
-       cpu_notify(CPU_STARTING, cpu);
-       return 0;
-}
-
 static int bringup_wait_for_ap(unsigned int cpu)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu)
        struct task_struct *idle = idle_thread_get(cpu);
        int ret;
 
+       /*
+        * Some architectures have to walk the irq descriptors to
+        * set up the vector space for the cpu which comes online.
+        * Prevent irq alloc/free across the bringup.
+        */
+       irq_lock_sparse();
+
        /* Arch-specific enabling code. */
        ret = __cpu_up(cpu, idle);
+       irq_unlock_sparse();
        if (ret) {
                cpu_notify(CPU_UP_CANCELED, cpu);
                return ret;
@@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu)
 /*
  * Hotplug state machine related functions
  */
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
-                         struct cpuhp_step *steps)
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
        for (st->state++; st->state < st->target; st->state++) {
-               struct cpuhp_step *step = steps + st->state;
+               struct cpuhp_step *step = cpuhp_get_step(st->state);
 
                if (!step->skip_onerr)
-                       cpuhp_invoke_callback(cpu, st->state, step->startup);
+                       cpuhp_invoke_callback(cpu, st->state, true, NULL);
        }
 }
 
 static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
-                               struct cpuhp_step *steps, enum cpuhp_state target)
+                               enum cpuhp_state target)
 {
        enum cpuhp_state prev_state = st->state;
        int ret = 0;
 
        for (; st->state > target; st->state--) {
-               struct cpuhp_step *step = steps + st->state;
-
-               ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+               ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
                if (ret) {
                        st->target = prev_state;
-                       undo_cpu_down(cpu, st, steps);
+                       undo_cpu_down(cpu, st);
                        break;
                }
        }
        return ret;
 }
 
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
-                       struct cpuhp_step *steps)
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
        for (st->state--; st->state > st->target; st->state--) {
-               struct cpuhp_step *step = steps + st->state;
+               struct cpuhp_step *step = cpuhp_get_step(st->state);
 
                if (!step->skip_onerr)
-                       cpuhp_invoke_callback(cpu, st->state, step->teardown);
+                       cpuhp_invoke_callback(cpu, st->state, false, NULL);
        }
 }
 
 static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
-                             struct cpuhp_step *steps, enum cpuhp_state target)
+                             enum cpuhp_state target)
 {
        enum cpuhp_state prev_state = st->state;
        int ret = 0;
 
        while (st->state < target) {
-               struct cpuhp_step *step;
-
                st->state++;
-               step = steps + st->state;
-               ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+               ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
                if (ret) {
                        st->target = prev_state;
-                       undo_cpu_up(cpu, st, steps);
+                       undo_cpu_up(cpu, st);
                        break;
                }
        }
@@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
        enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
 
-       return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
+       return cpuhp_down_callbacks(cpu, st, target);
 }
 
 /* Execute the online startup callbacks. Used to be CPU_ONLINE */
 static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
-       return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
+       return cpuhp_up_callbacks(cpu, st, st->target);
 }
 
 /*
@@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu)
        st->should_run = false;
 
        /* Single callback invocation for [un]install ? */
-       if (st->cb) {
+       if (st->single) {
                if (st->cb_state < CPUHP_AP_ONLINE) {
                        local_irq_disable();
-                       ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+                       ret = cpuhp_invoke_callback(cpu, st->cb_state,
+                                                   st->bringup, st->node);
                        local_irq_enable();
                } else {
-                       ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+                       ret = cpuhp_invoke_callback(cpu, st->cb_state,
+                                                   st->bringup, st->node);
                }
        } else if (st->rollback) {
                BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
 
-               undo_cpu_down(cpu, st, cpuhp_ap_states);
+               undo_cpu_down(cpu, st);
                /*
                 * This is a momentary workaround to keep the notifier users
                 * happy. Will go away once we get rid of the notifiers.
@@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
 }
 
 /* Invoke a single callback on a remote cpu */
-static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
-                                   int (*cb)(unsigned int))
+static int
+cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
+                        struct hlist_node *node)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
@@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
         * we invoke the thread function directly.
         */
        if (!st->thread)
-               return cpuhp_invoke_callback(cpu, state, cb);
+               return cpuhp_invoke_callback(cpu, state, bringup, node);
 
        st->cb_state = state;
-       st->cb = cb;
+       st->single = true;
+       st->bringup = bringup;
+       st->node = node;
+
        /*
         * Make sure the above stores are visible before should_run becomes
         * true. Paired with the mb() above in cpuhp_thread_fun()
@@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
 static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
 {
        st->result = 0;
-       st->cb = NULL;
+       st->single = false;
        /*
         * Make sure the above stores are visible before should_run becomes
         * true. Paired with the mb() above in cpuhp_thread_fun()
@@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu)
        return err;
 }
 
-static int notify_dying(unsigned int cpu)
-{
-       cpu_notify(CPU_DYING, cpu);
-       return 0;
-}
-
 /* Take this CPU down. */
 static int take_cpu_down(void *_param)
 {
@@ -692,12 +767,16 @@ static int take_cpu_down(void *_param)
        if (err < 0)
                return err;
 
+       /*
+        * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
+        * do this step again.
+        */
+       WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
+       st->state--;
        /* Invoke the former CPU_DYING callbacks */
-       for (; st->state > target; st->state--) {
-               struct cpuhp_step *step = cpuhp_ap_states + st->state;
+       for (; st->state > target; st->state--)
+               cpuhp_invoke_callback(cpu, st->state, false, NULL);
 
-               cpuhp_invoke_callback(cpu, st->state, step->teardown);
-       }
        /* Give up timekeeping duties */
        tick_handover_do_timer();
        /* Park the stopper thread */
@@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu)
        BUG_ON(cpu_online(cpu));
 
        /*
-        * The migration_call() CPU_DYING callback will have removed all
+        * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
         * runnable tasks from the cpu, there's only the idle task left now
         * that the migration thread is done doing the stop_machine thing.
         *
@@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void)
 #define notify_down_prepare    NULL
 #define takedown_cpu           NULL
 #define notify_dead            NULL
-#define notify_dying           NULL
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
         * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
         * to do the further cleanups.
         */
-       ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+       ret = cpuhp_down_callbacks(cpu, st, target);
        if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
                st->target = prev_state;
                st->rollback = true;
@@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
  * @cpu: cpu that just started
  *
- * This function calls the cpu_chain notifiers with CPU_STARTING.
  * It must be called by the arch code on the new cpu, before the new cpu
  * enables interrupts and before the "boot" cpu returns from __cpu_up().
  */
@@ -890,11 +967,8 @@ void notify_cpu_starting(unsigned int cpu)
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
 
        while (st->state < target) {
-               struct cpuhp_step *step;
-
                st->state++;
-               step = cpuhp_ap_states + st->state;
-               cpuhp_invoke_callback(cpu, st->state, step->startup);
+               cpuhp_invoke_callback(cpu, st->state, true, NULL);
        }
 }
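The contract in the kernel-doc above translates into a fixed ordering in each architecture's secondary-CPU entry path. A simplified sketch of that shape (acme_secondary_start() is a hypothetical stand-in for the arch-specific function; the ordering mirrors the documented requirements):

	static void acme_secondary_start(void)
	{
		unsigned int cpu = smp_processor_id();

		/* Low-level per-CPU setup runs first, interrupts still off. */
		notify_cpu_starting(cpu);	/* STARTING-section callbacks */
		set_cpu_online(cpu, true);

		local_irq_enable();
		cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
	}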
 
@@ -979,7 +1053,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
         * responsible for bringing it up to the target state.
         */
        target = min((int)target, CPUHP_BRINGUP_CPU);
-       ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
+       ret = cpuhp_up_callbacks(cpu, st, target);
 out:
        cpu_hotplug_done();
        return ret;
@@ -1082,7 +1156,7 @@ void enable_nonboot_cpus(void)
 
        /* Allow everyone to use the CPU hotplug again */
        cpu_maps_update_begin();
-       WARN_ON(--cpu_hotplug_disabled < 0);
+       __cpu_hotplug_enable();
        if (cpumask_empty(frozen_cpus))
                goto out;
 
@@ -1171,40 +1245,50 @@ core_initcall(cpu_hotplug_pm_sync_init);
 static struct cpuhp_step cpuhp_bp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                   = "offline",
-               .startup                = NULL,
-               .teardown               = NULL,
+               .startup.single         = NULL,
+               .teardown.single        = NULL,
        },
 #ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
-               .name                   = "threads:create",
-               .startup                = smpboot_create_threads,
-               .teardown               = NULL,
+               .name                   = "threads:prepare",
+               .startup.single         = smpboot_create_threads,
+               .teardown.single        = NULL,
                .cant_stop              = true,
        },
        [CPUHP_PERF_PREPARE] = {
-               .name = "perf prepare",
-               .startup = perf_event_init_cpu,
-               .teardown = perf_event_exit_cpu,
+               .name                   = "perf:prepare",
+               .startup.single         = perf_event_init_cpu,
+               .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_WORKQUEUE_PREP] = {
-               .name = "workqueue prepare",
-               .startup = workqueue_prepare_cpu,
-               .teardown = NULL,
+               .name                   = "workqueue:prepare",
+               .startup.single         = workqueue_prepare_cpu,
+               .teardown.single        = NULL,
        },
        [CPUHP_HRTIMERS_PREPARE] = {
-               .name = "hrtimers prepare",
-               .startup = hrtimers_prepare_cpu,
-               .teardown = hrtimers_dead_cpu,
+               .name                   = "hrtimers:prepare",
+               .startup.single         = hrtimers_prepare_cpu,
+               .teardown.single        = hrtimers_dead_cpu,
        },
        [CPUHP_SMPCFD_PREPARE] = {
-               .name = "SMPCFD prepare",
-               .startup = smpcfd_prepare_cpu,
-               .teardown = smpcfd_dead_cpu,
+               .name                   = "smpcfd:prepare",
+               .startup.single         = smpcfd_prepare_cpu,
+               .teardown.single        = smpcfd_dead_cpu,
+       },
+       [CPUHP_RELAY_PREPARE] = {
+               .name                   = "relay:prepare",
+               .startup.single         = relay_prepare_cpu,
+               .teardown.single        = NULL,
+       },
+       [CPUHP_SLAB_PREPARE] = {
+               .name                   = "slab:prepare",
+               .startup.single         = slab_prepare_cpu,
+               .teardown.single        = slab_dead_cpu,
        },
        [CPUHP_RCUTREE_PREP] = {
-               .name = "RCU-tree prepare",
-               .startup = rcutree_prepare_cpu,
-               .teardown = rcutree_dead_cpu,
+               .name                   = "RCU/tree:prepare",
+               .startup.single         = rcutree_prepare_cpu,
+               .teardown.single        = rcutree_dead_cpu,
        },
        /*
         * Preparatory and dead notifiers. Will be replaced once the notifiers
@@ -1212,8 +1296,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
         */
        [CPUHP_NOTIFY_PREPARE] = {
                .name                   = "notify:prepare",
-               .startup                = notify_prepare,
-               .teardown               = notify_dead,
+               .startup.single         = notify_prepare,
+               .teardown.single        = notify_dead,
                .skip_onerr             = true,
                .cant_stop              = true,
        },
@@ -1223,20 +1307,21 @@ static struct cpuhp_step cpuhp_bp_states[] = {
         * otherwise a RCU stall occurs.
         */
        [CPUHP_TIMERS_DEAD] = {
-               .name = "timers dead",
-               .startup = NULL,
-               .teardown = timers_dead_cpu,
+               .name                   = "timers:dead",
+               .startup.single         = NULL,
+               .teardown.single        = timers_dead_cpu,
        },
        /* Kicks the plugged cpu into life */
        [CPUHP_BRINGUP_CPU] = {
                .name                   = "cpu:bringup",
-               .startup                = bringup_cpu,
-               .teardown               = NULL,
+               .startup.single         = bringup_cpu,
+               .teardown.single        = NULL,
                .cant_stop              = true,
        },
        [CPUHP_AP_SMPCFD_DYING] = {
-               .startup = NULL,
-               .teardown = smpcfd_dying_cpu,
+               .name                   = "smpcfd:dying",
+               .startup.single         = NULL,
+               .teardown.single        = smpcfd_dying_cpu,
        },
        /*
         * Handled on control processor until the plugged processor manages
@@ -1244,8 +1329,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
         */
        [CPUHP_TEARDOWN_CPU] = {
                .name                   = "cpu:teardown",
-               .startup                = NULL,
-               .teardown               = takedown_cpu,
+               .startup.single         = NULL,
+               .teardown.single        = takedown_cpu,
                .cant_stop              = true,
        },
 #else
@@ -1271,24 +1356,13 @@ static struct cpuhp_step cpuhp_ap_states[] = {
        /* First state is scheduler control. Interrupts are disabled */
        [CPUHP_AP_SCHED_STARTING] = {
                .name                   = "sched:starting",
-               .startup                = sched_cpu_starting,
-               .teardown               = sched_cpu_dying,
+               .startup.single         = sched_cpu_starting,
+               .teardown.single        = sched_cpu_dying,
        },
        [CPUHP_AP_RCUTREE_DYING] = {
-               .startup = NULL,
-               .teardown = rcutree_dying_cpu,
-       },
-       /*
-        * Low level startup/teardown notifiers. Run with interrupts
-        * disabled. Will be removed once the notifiers are converted to
-        * states.
-        */
-       [CPUHP_AP_NOTIFY_STARTING] = {
-               .name                   = "notify:starting",
-               .startup                = notify_starting,
-               .teardown               = notify_dying,
-               .skip_onerr             = true,
-               .cant_stop              = true,
+               .name                   = "RCU/tree:dying",
+               .startup.single         = NULL,
+               .teardown.single        = rcutree_dying_cpu,
        },
        /* Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization */
@@ -1297,24 +1371,24 @@ static struct cpuhp_step cpuhp_ap_states[] = {
        },
        /* Handle smpboot threads park/unpark */
        [CPUHP_AP_SMPBOOT_THREADS] = {
-               .name                   = "smpboot:threads",
-               .startup                = smpboot_unpark_threads,
-               .teardown               = NULL,
+               .name                   = "smpboot/threads:online",
+               .startup.single         = smpboot_unpark_threads,
+               .teardown.single        = NULL,
        },
        [CPUHP_AP_PERF_ONLINE] = {
-               .name = "perf online",
-               .startup = perf_event_init_cpu,
-               .teardown = perf_event_exit_cpu,
+               .name                   = "perf:online",
+               .startup.single         = perf_event_init_cpu,
+               .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_AP_WORKQUEUE_ONLINE] = {
-               .name = "workqueue online",
-               .startup = workqueue_online_cpu,
-               .teardown = workqueue_offline_cpu,
+               .name                   = "workqueue:online",
+               .startup.single         = workqueue_online_cpu,
+               .teardown.single        = workqueue_offline_cpu,
        },
        [CPUHP_AP_RCUTREE_ONLINE] = {
-               .name = "RCU-tree online",
-               .startup = rcutree_online_cpu,
-               .teardown = rcutree_offline_cpu,
+               .name                   = "RCU/tree:online",
+               .startup.single         = rcutree_online_cpu,
+               .teardown.single        = rcutree_offline_cpu,
        },
 
        /*
@@ -1323,8 +1397,8 @@ static struct cpuhp_step cpuhp_ap_states[] = {
         */
        [CPUHP_AP_NOTIFY_ONLINE] = {
                .name                   = "notify:online",
-               .startup                = notify_online,
-               .teardown               = notify_down_prepare,
+               .startup.single         = notify_online,
+               .teardown.single        = notify_down_prepare,
                .skip_onerr             = true,
        },
 #endif
@@ -1336,16 +1410,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
        /* Last state is scheduler control setting the cpu active */
        [CPUHP_AP_ACTIVE] = {
                .name                   = "sched:active",
-               .startup                = sched_cpu_activate,
-               .teardown               = sched_cpu_deactivate,
+               .startup.single         = sched_cpu_activate,
+               .teardown.single        = sched_cpu_deactivate,
        },
 #endif
 
        /* CPU is fully up and running. */
        [CPUHP_ONLINE] = {
                .name                   = "online",
-               .startup                = NULL,
-               .teardown               = NULL,
+               .startup.single         = NULL,
+               .teardown.single        = NULL,
        },
 };
 
@@ -1357,54 +1431,42 @@ static int cpuhp_cb_check(enum cpuhp_state state)
        return 0;
 }
 
-static bool cpuhp_is_ap_state(enum cpuhp_state state)
-{
-       /*
-        * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
-        * purposes as that state is handled explicitely in cpu_down.
-        */
-       return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
-}
-
-static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
-{
-       struct cpuhp_step *sp;
-
-       sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
-       return sp + state;
-}
-
 static void cpuhp_store_callbacks(enum cpuhp_state state,
                                  const char *name,
                                  int (*startup)(unsigned int cpu),
-                                 int (*teardown)(unsigned int cpu))
+                                 int (*teardown)(unsigned int cpu),
+                                 bool multi_instance)
 {
        /* (Un)Install the callbacks for further cpu hotplug operations */
        struct cpuhp_step *sp;
 
        mutex_lock(&cpuhp_state_mutex);
        sp = cpuhp_get_step(state);
-       sp->startup = startup;
-       sp->teardown = teardown;
+       sp->startup.single = startup;
+       sp->teardown.single = teardown;
        sp->name = name;
+       sp->multi_instance = multi_instance;
+       INIT_HLIST_HEAD(&sp->list);
        mutex_unlock(&cpuhp_state_mutex);
 }
 
 static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
 {
-       return cpuhp_get_step(state)->teardown;
+       return cpuhp_get_step(state)->teardown.single;
 }
 
 /*
  * Call the startup/teardown function for a step either on the AP or
  * on the current CPU.
  */
-static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
-                           int (*cb)(unsigned int), bool bringup)
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
+                           struct hlist_node *node)
 {
+       struct cpuhp_step *sp = cpuhp_get_step(state);
        int ret;
 
-       if (!cb)
+       if ((bringup && !sp->startup.single) ||
+           (!bringup && !sp->teardown.single))
                return 0;
        /*
         * The non AP bound callbacks can fail on bringup. On teardown
@@ -1412,11 +1474,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
         */
 #ifdef CONFIG_SMP
        if (cpuhp_is_ap_state(state))
-               ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+               ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
        else
-               ret = cpuhp_invoke_callback(cpu, state, cb);
+               ret = cpuhp_invoke_callback(cpu, state, bringup, node);
 #else
-       ret = cpuhp_invoke_callback(cpu, state, cb);
+       ret = cpuhp_invoke_callback(cpu, state, bringup, node);
 #endif
        BUG_ON(ret && !bringup);
        return ret;
@@ -1428,13 +1490,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
  * Note: The teardown callbacks for rollback are not allowed to fail!
  */
 static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
-                                  int (*teardown)(unsigned int cpu))
+                                  struct hlist_node *node)
 {
        int cpu;
 
-       if (!teardown)
-               return;
-
        /* Roll back the already executed steps on the other cpus */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -1445,7 +1504,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
 
                /* Did we invoke the startup call on that cpu ? */
                if (cpustate >= state)
-                       cpuhp_issue_call(cpu, state, teardown, false);
+                       cpuhp_issue_call(cpu, state, false, node);
        }
 }
 
@@ -1472,6 +1531,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
        return -ENOSPC;
 }
 
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+                              bool invoke)
+{
+       struct cpuhp_step *sp;
+       int cpu;
+       int ret;
+
+       sp = cpuhp_get_step(state);
+       if (sp->multi_instance == false)
+               return -EINVAL;
+
+       get_online_cpus();
+
+       if (!invoke || !sp->startup.multi)
+               goto add_node;
+
+       /*
+        * Try to call the startup callback for each present cpu
+        * depending on the hotplug state of the cpu.
+        */
+       for_each_present_cpu(cpu) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+               int cpustate = st->state;
+
+               if (cpustate < state)
+                       continue;
+
+               ret = cpuhp_issue_call(cpu, state, true, node);
+               if (ret) {
+                       if (sp->teardown.multi)
+                               cpuhp_rollback_install(cpu, state, node);
+                       goto err;
+               }
+       }
+add_node:
+       ret = 0;
+       mutex_lock(&cpuhp_state_mutex);
+       hlist_add_head(node, &sp->list);
+       mutex_unlock(&cpuhp_state_mutex);
+
+err:
+       put_online_cpus();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
+
 /**
  * __cpuhp_setup_state - Setup the callbacks for a hotplug machine state
  * @state:     The state to setup
@@ -1485,7 +1590,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
 int __cpuhp_setup_state(enum cpuhp_state state,
                        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
-                       int (*teardown)(unsigned int cpu))
+                       int (*teardown)(unsigned int cpu),
+                       bool multi_instance)
 {
        int cpu, ret = 0;
        int dyn_state = 0;
@@ -1504,7 +1610,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
                state = ret;
        }
 
-       cpuhp_store_callbacks(state, name, startup, teardown);
+       cpuhp_store_callbacks(state, name, startup, teardown, multi_instance);
 
        if (!invoke || !startup)
                goto out;
@@ -1520,10 +1626,11 @@ int __cpuhp_setup_state(enum cpuhp_state state,
                if (cpustate < state)
                        continue;
 
-               ret = cpuhp_issue_call(cpu, state, startup, true);
+               ret = cpuhp_issue_call(cpu, state, true, NULL);
                if (ret) {
-                       cpuhp_rollback_install(cpu, state, teardown);
-                       cpuhp_store_callbacks(state, NULL, NULL, NULL);
+                       if (teardown)
+                               cpuhp_rollback_install(cpu, state, NULL);
+                       cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
                        goto out;
                }
        }
@@ -1535,6 +1642,42 @@ out:
 }
 EXPORT_SYMBOL(__cpuhp_setup_state);
 
+int __cpuhp_state_remove_instance(enum cpuhp_state state,
+                                 struct hlist_node *node, bool invoke)
+{
+       struct cpuhp_step *sp = cpuhp_get_step(state);
+       int cpu;
+
+       BUG_ON(cpuhp_cb_check(state));
+
+       if (!sp->multi_instance)
+               return -EINVAL;
+
+       get_online_cpus();
+       if (!invoke || !cpuhp_get_teardown_cb(state))
+               goto remove;
+       /*
+        * Call the teardown callback for each present cpu depending
+        * on the hotplug state of the cpu. This function is not
+        * allowed to fail currently!
+        */
+       for_each_present_cpu(cpu) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+               int cpustate = st->state;
+
+               if (cpustate >= state)
+                       cpuhp_issue_call(cpu, state, false, node);
+       }
+
+remove:
+       mutex_lock(&cpuhp_state_mutex);
+       hlist_del(node);
+       mutex_unlock(&cpuhp_state_mutex);
+       put_online_cpus();
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
 /**
  * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state
  * @state:     The state to remove
@@ -1546,14 +1689,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state);
  */
 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
 {
-       int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+       struct cpuhp_step *sp = cpuhp_get_step(state);
        int cpu;
 
        BUG_ON(cpuhp_cb_check(state));
 
        get_online_cpus();
 
-       if (!invoke || !teardown)
+       if (sp->multi_instance) {
+               WARN(!hlist_empty(&sp->list),
+                    "Error: Removing state %d which has instances left.\n",
+                    state);
+               goto remove;
+       }
+
+       if (!invoke || !cpuhp_get_teardown_cb(state))
                goto remove;
 
        /*
@@ -1566,10 +1716,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
                int cpustate = st->state;
 
                if (cpustate >= state)
-                       cpuhp_issue_call(cpu, state, teardown, false);
+                       cpuhp_issue_call(cpu, state, false, NULL);
        }
 remove:
-       cpuhp_store_callbacks(state, NULL, NULL, NULL);
+       cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        put_online_cpus();
 }
 EXPORT_SYMBOL(__cpuhp_remove_state);
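Taken together, the exported entry points above give the expected life cycle for a multi-instance state: set the state up once, then add one node per device instance and remove it again on teardown. A sketch of that call sequence, assuming the cpuhp_setup_state_multi()/cpuhp_state_add_instance()/cpuhp_state_remove_instance() convenience wrappers that the matching <linux/cpuhotplug.h> change is expected to provide, and reusing the hypothetical acme callbacks sketched earlier:

	static enum cpuhp_state acme_hp_state;

	static int __init acme_init(void)
	{
		int ret;

		/* Reserve a dynamic AP state; no instances exist yet. */
		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "acme:online",
					      acme_cpu_online, acme_cpu_offline);
		if (ret < 0)
			return ret;
		acme_hp_state = ret;
		return 0;
	}

	static int acme_probe(struct acme_dev *adev)
	{
		/* Runs acme_cpu_online(cpu, &adev->node) on every online CPU. */
		return cpuhp_state_add_instance(acme_hp_state, &adev->node);
	}

	static void acme_remove(struct acme_dev *adev)
	{
		/* Runs acme_cpu_offline() on every online CPU, then unlinks. */
		cpuhp_state_remove_instance(acme_hp_state, &adev->node);
	}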
index a7b8c1c75fa71c057b504d249bfeff0161f0cd1f..4a4a94774dfa3ed5e402b13d83d506596251ff41 100644 (file)
@@ -1475,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (event->group_leader == event) {
                struct list_head *list;
 
-               if (is_software_event(event))
-                       event->group_flags |= PERF_GROUP_SOFTWARE;
+               event->group_caps = event->event_caps;
 
                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
@@ -1630,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event)
 
        WARN_ON_ONCE(group_leader->ctx != event->ctx);
 
-       if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
-                       !is_software_event(event))
-               group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+       group_leader->group_caps &= event->event_caps;
 
        list_add_tail(&event->group_entry, &group_leader->sibling_list);
        group_leader->nr_siblings++;
@@ -1723,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event)
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
-               sibling->group_flags = event->group_flags;
+               sibling->group_caps = event->group_caps;
 
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
@@ -1832,6 +1829,8 @@ group_sched_out(struct perf_event *group_event,
        struct perf_event *event;
        int state = group_event->state;
 
+       perf_pmu_disable(ctx->pmu);
+
        event_sched_out(group_event, cpuctx, ctx);
 
        /*
@@ -1840,6 +1839,8 @@ group_sched_out(struct perf_event *group_event,
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);
 
+       perf_pmu_enable(ctx->pmu);
+
        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
 }
@@ -2145,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event,
        /*
         * Groups consisting entirely of software events can always go on.
         */
-       if (event->group_flags & PERF_GROUP_SOFTWARE)
+       if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
@@ -2491,16 +2492,16 @@ static int __perf_event_stop(void *info)
         * while restarting.
         */
        if (sd->restart)
-               event->pmu->start(event, PERF_EF_START);
+               event->pmu->start(event, 0);
 
        return 0;
 }
 
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
 {
        struct stop_event_data sd = {
                .event          = event,
-               .restart        = 1,
+               .restart        = restart,
        };
        int ret = 0;
 
@@ -2837,19 +2838,36 @@ unlock:
        }
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
        this_cpu_dec(perf_sched_cb_usages);
+
+       if (!--cpuctx->sched_cb_usage)
+               list_del(&cpuctx->sched_cb_entry);
 }
 
 void perf_sched_cb_inc(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+       if (!cpuctx->sched_cb_usage++)
+               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
        this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example, multi-event
+ * PEBS requires this to provide PID/TID information. This requires that we
+ * flush all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
@@ -2857,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
-       unsigned long flags;
 
        if (prev == next)
                return;
 
-       local_irq_save(flags);
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               if (pmu->sched_task) {
-                       cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+               pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
 
-                       perf_pmu_disable(pmu);
+               if (WARN_ON_ONCE(!pmu->sched_task))
+                       continue;
 
-                       pmu->sched_task(cpuctx->task_ctx, sched_in);
+               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+               perf_pmu_disable(pmu);
 
-                       perf_pmu_enable(pmu);
+               pmu->sched_task(cpuctx->task_ctx, sched_in);
 
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-               }
+               perf_pmu_enable(pmu);
+               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }
-
-       rcu_read_unlock();
-
-       local_irq_restore(flags);
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3416,6 +3424,22 @@ struct perf_read_data {
        int ret;
 };
 
+static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+{
+       int event_cpu = event->oncpu;
+       u16 local_pkg, event_pkg;
+
+       if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+               event_pkg = topology_physical_package_id(event_cpu);
+               local_pkg = topology_physical_package_id(local_cpu);
+
+               if (event_pkg == local_pkg)
+                       return local_cpu;
+       }
+
+       return event_cpu;
+}
+
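find_cpu_to_read() only redirects the cross-call when the event (or its group) advertises package-scoped counters, so a read can be served by any CPU in the same package as event->oncpu. PMU drivers opt in from their event-init path; a hedged sketch of that opt-in (acme_uncore_event_init() and its surrounding driver are hypothetical, the capability bit is the one tested above):

	static int acme_uncore_event_init(struct perf_event *event)
	{
		/* ... usual attr/config validation elided ... */

		/*
		 * Counters are per package: a read IPI may be answered by
		 * any CPU in the package that hosts event->oncpu.
		 */
		event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
		return 0;
	}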
 /*
  * Cross CPU call to read the hardware event
  */
@@ -3537,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event)
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
-       int ret = 0;
+       int ret = 0, cpu_to_read, local_cpu;
 
        /*
         * If event is enabled and currently active on a CPU, update the
@@ -3549,10 +3573,23 @@ static int perf_event_read(struct perf_event *event, bool group)
                        .group = group,
                        .ret = 0,
                };
-               ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
-               /* The event must have been read from an online CPU: */
-               WARN_ON_ONCE(ret);
-               ret = ret ? : data.ret;
+
+               local_cpu = get_cpu();
+               cpu_to_read = find_cpu_to_read(event, local_cpu);
+               put_cpu();
+
+               /*
+                * Purposely ignore the smp_call_function_single() return
+                * value.
+                *
+                * If event->oncpu isn't a valid CPU it means the event got
+                * scheduled out and that will have updated the event count.
+                *
+                * Therefore, either way, we'll have an up-to-date event count
+                * after this.
+                */
+               (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
+               ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
@@ -4837,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event,
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }
 
+       /*
+        * Avoid racing with perf_mmap_close(AUX): stop the event
+        * before swizzling the event::rb pointer; if it's getting
+        * unmapped, its aux_mmap_count will be 0 and it won't
+        * restart. See the comment in __perf_pmu_output_stop().
+        *
+        * Data will inevitably be lost when set_output is done in
+        * mid-air, but then again, whoever does it like this is
+        * not in for the data anyway.
+        */
+       if (has_aux(event))
+               perf_event_stop(event, 0);
+
        rcu_assign_pointer(event->rb, rb);
 
        if (old_rb) {
@@ -5329,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
 {
        int bit;
+       DECLARE_BITMAP(_mask, 64);
 
-       for_each_set_bit(bit, (const unsigned long *) &mask,
-                        sizeof(mask) * BITS_PER_BYTE) {
+       bitmap_from_u64(_mask, mask);
+       for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
                u64 val;
 
                val = perf_reg_value(regs, bit);
@@ -6112,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
        raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
        if (restart)
-               perf_event_restart(event);
+               perf_event_stop(event, 1);
 }
 
 void perf_event_exec(void)
@@ -6156,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 
        /*
         * In case of inheritance, it will be the parent that links to the
-        * ring-buffer, but it will be the child that's actually using it:
+        * ring-buffer, but it will be the child that's actually using it.
+        *
+        * We are using event::rb to determine if the event should be stopped,
+        * however this may race with ring_buffer_attach() (through set_output),
+        * which will make us skip the event that actually needs to be stopped.
+        * So ring_buffer_attach() has to stop an aux event before re-assigning
+        * its rb pointer.
         */
        if (rcu_dereference(parent->rb) == rb)
                ro->err = __perf_event_stop(&sd);
@@ -6670,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
        raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
        if (restart)
-               perf_event_restart(event);
+               perf_event_stop(event, 1);
 }
 
 /*
@@ -7933,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        mmput(mm);
 
 restart:
-       perf_event_restart(event);
+       perf_event_stop(event, 1);
 }
 
 /*
@@ -9565,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open,
                        goto err_alloc;
        }
 
+       if (pmu->task_ctx_nr == perf_sw_context)
+               event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
        if (group_leader &&
            (is_software_event(event) != is_software_event(group_leader))) {
                if (is_software_event(event)) {
@@ -9578,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open,
                         */
                        pmu = group_leader->pmu;
                } else if (is_software_event(group_leader) &&
-                          (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                          (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * In case the group is a pure software group, and we
                         * try to add a hardware event, move the whole group to
@@ -10513,6 +10573,8 @@ static void __init perf_event_init_all_cpus(void)
 
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
 }
 
index ae9b90dc9a5a66c74134d4464665ee05c125ea2b..257fa460b846032744e2da500d489d0995678571 100644 (file)
@@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
        if (!rb)
                return NULL;
 
-       if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+       if (!rb_has_aux(rb))
                goto err;
 
        /*
-        * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
-        * the aux buffer is in perf_mmap_close(), about to get freed.
+        * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+        * about to get freed, so we leave immediately.
+        *
+        * Checking rb::aux_mmap_count and rb::refcount has to be done in
+        * the same order, see perf_mmap_close. Otherwise we end up freeing
+        * aux pages in this path, which is a bug because this path can be
+        * called from atomic context (in_atomic()).
         */
        if (!atomic_read(&rb->aux_mmap_count))
-               goto err_put;
+               goto err;
+
+       if (!atomic_inc_not_zero(&rb->aux_refcount))
+               goto err;
 
        /*
         * Nesting is not supported for AUX area, make sure nested
index 8c50276b60d1c7fb75da997f85819f5b9a313c27..d4129bb05e5d044101ba2cbea672f96954b69a51 100644 (file)
@@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
  * Returns 0 on success, -EFAULT on failure.
  */
 static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
-                               struct page *page, struct page *kpage)
+                               struct page *old_page, struct page *new_page)
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
@@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        const unsigned long mmun_end   = addr + PAGE_SIZE;
        struct mem_cgroup *memcg;
 
-       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+       err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
                        false);
        if (err)
                return err;
 
        /* For try_to_free_swap() and munlock_vma_page() below */
-       lock_page(page);
+       lock_page(old_page);
 
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        err = -EAGAIN;
-       ptep = page_check_address(page, mm, addr, &ptl, 0);
+       ptep = page_check_address(old_page, mm, addr, &ptl, 0);
        if (!ptep) {
-               mem_cgroup_cancel_charge(kpage, memcg, false);
+               mem_cgroup_cancel_charge(new_page, memcg, false);
                goto unlock;
        }
 
-       get_page(kpage);
-       page_add_new_anon_rmap(kpage, vma, addr, false);
-       mem_cgroup_commit_charge(kpage, memcg, false, false);
-       lru_cache_add_active_or_unevictable(kpage, vma);
+       get_page(new_page);
+       page_add_new_anon_rmap(new_page, vma, addr, false);
+       mem_cgroup_commit_charge(new_page, memcg, false, false);
+       lru_cache_add_active_or_unevictable(new_page, vma);
 
-       if (!PageAnon(page)) {
-               dec_mm_counter(mm, mm_counter_file(page));
+       if (!PageAnon(old_page)) {
+               dec_mm_counter(mm, mm_counter_file(old_page));
                inc_mm_counter(mm, MM_ANONPAGES);
        }
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush_notify(vma, addr, ptep);
-       set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+       set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
 
-       page_remove_rmap(page, false);
-       if (!page_mapped(page))
-               try_to_free_swap(page);
+       page_remove_rmap(old_page, false);
+       if (!page_mapped(old_page))
+               try_to_free_swap(old_page);
        pte_unmap_unlock(ptep, ptl);
 
        if (vma->vm_flags & VM_LOCKED)
-               munlock_vma_page(page);
-       put_page(page);
+               munlock_vma_page(old_page);
+       put_page(old_page);
 
        err = 0;
  unlock:
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       unlock_page(page);
+       unlock_page(old_page);
        return err;
 }
 
index 05393881ef399ac365b6fa522864875a1e55b6f7..c2ecca44406b130e0ec7abc4d2648beff6738814 100644 (file)
@@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                                 int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+       void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                          VMALLOC_START, VMALLOC_END,
+                                          THREADINFO_GFP | __GFP_HIGHMEM,
+                                          PAGE_KERNEL,
+                                          0, node,
+                                          __builtin_return_address(0));
+
+       /*
+        * We can't call find_vm_area() in interrupt context, and
+        * free_thread_stack() can be called in interrupt context,
+        * so cache the vm_struct.
+        */
+       if (stack)
+               tsk->stack_vm_area = find_vm_area(stack);
+       return stack;
+#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+       if (task_stack_vm_area(tsk))
+               vfree(tsk->stack);
+       else
+               __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-       kmem_cache_free(thread_stack_cache, stack);
+       kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
@@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-       /* All stack pages are in the same zone and belong to the same memcg. */
-       struct page *first_page = virt_to_page(stack);
+       void *stack = task_stack_page(tsk);
+       struct vm_struct *vm = task_stack_vm_area(tsk);
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+       if (vm) {
+               int i;
+
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                       mod_zone_page_state(page_zone(vm->pages[i]),
+                                           NR_KERNEL_STACK_KB,
+                                           PAGE_SIZE / 1024 * account);
+               }
 
-       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                           THREAD_SIZE / 1024 * account);
+               /* All stack pages belong to the same memcg. */
+               memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       } else {
+               /*
+                * All stack pages are in the same zone and belong to the
+                * same memcg.
+                */
+               struct page *first_page = virt_to_page(stack);
+
+               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                   THREAD_SIZE / 1024 * account);
 
-       memcg_kmem_update_page_stat(
-               first_page, MEMCG_KERNEL_STACK_KB,
-               account * (THREAD_SIZE / 1024));
+               memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       }
 }
 
 void free_task(struct task_struct *tsk)
 {
-       account_kernel_stack(tsk->stack, -1);
+       account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
-       free_thread_stack(tsk->stack);
+       free_thread_stack(tsk);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
        unsigned long *stack;
+       struct vm_struct *stack_vm_area;
        int err;
 
        if (node == NUMA_NO_NODE)
@@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (!stack)
                goto free_tsk;
 
+       stack_vm_area = task_stack_vm_area(tsk);
+
        err = arch_dup_task_struct(tsk, orig);
+
+       /*
+        * arch_dup_task_struct() clobbers the stack-related fields.  Make
+        * sure they're properly initialized before using any stack-related
+        * functions again.
+        */
+       tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = stack_vm_area;
+#endif
+
        if (err)
                goto free_stack;
 
-       tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
 
-       account_kernel_stack(stack, 1);
+       account_kernel_stack(tsk, 1);
 
        kcov_task_init(tsk);
 
        return tsk;
 
 free_stack:
-       free_thread_stack(stack);
+       free_thread_stack(tsk);
 free_tsk:
        free_task_struct(tsk);
        return NULL;
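The accounting and free paths above key off a task_stack_vm_area() accessor that is not part of this file's hunks. A plausible sketch of it, assuming it sits next to task_stack_page() in <linux/sched.h> and merely exposes the vm_struct cached by alloc_thread_stack_node() (NULL when the stack is not vmalloc'ed):

	#ifdef CONFIG_VMAP_STACK
	static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
	{
		return t->stack_vm_area;
	}
	#else
	static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
	{
		return NULL;
	}
	#endif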
index 46cb3a301bc1555a84607bab7b2aea8a2bcaf7e6..2c4be467fecdcb69e4cf77e84a0af71034513677 100644 (file)
@@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
 #endif
 }
 
-/*
- * We hash on the keys returned from get_futex_key (see below).
+/**
+ * hash_futex - Return the hash bucket in the global hash
+ * @key:       Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
  */
 static struct futex_hash_bucket *hash_futex(union futex_key *key)
 {
@@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
        return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
-/*
+
+/**
+ * match_futex - Check whether two futex keys are equal
+ * @key1:      Pointer to key1
+ * @key2:      Pointer to key2
+ *
  * Return 1 if two futex_keys are equal, 0 otherwise.
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
index d234022805dcb5c5b8b169c92fde5956b5b0ce79..432c3d71d19536b0f468ec84ecc15d2300752103 100644 (file)
@@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
        pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
                " disables this message.\n");
        sched_show_task(t);
-       debug_show_held_locks(t);
+       debug_show_all_locks();
 
        touch_nmi_watchdog();
 
index 637389088b3f9cf88db58ce920350a3517a20ed2..18f29586f230e4d534eabcb9e387d2de38ddc948 100644 (file)
@@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
        if (!desc)
                return -EINVAL;
 
-       type &= IRQ_TYPE_SENSE_MASK;
        ret = __irq_set_trigger(desc, type);
        irq_put_desc_busunlock(desc, flags);
        return ret;
@@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
 {
        struct irq_chip *chip = irq_desc_get_chip(desc);
        struct irqaction *action = desc->action;
-       void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
        unsigned int irq = irq_desc_get_irq(desc);
        irqreturn_t res;
 
@@ -765,9 +763,20 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
        if (chip->irq_ack)
                chip->irq_ack(&desc->irq_data);
 
-       trace_irq_handler_entry(irq, action);
-       res = action->handler(irq, dev_id);
-       trace_irq_handler_exit(irq, action, res);
+       if (likely(action)) {
+               trace_irq_handler_entry(irq, action);
+               res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+               trace_irq_handler_exit(irq, action, res);
+       } else {
+               unsigned int cpu = smp_processor_id();
+               bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+
+               if (enabled)
+                       irq_percpu_disable(desc, cpu);
+
+               pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n",
+                           enabled ? " and unmasked" : "", irq, cpu);
+       }
 
        if (chip->irq_eoi)
                chip->irq_eoi(&desc->irq_data);
index abd286afbd2732544e54a4653053a4d9ce785286..ee32870079c9c9a414f4cb7a72efba1743d73cf9 100644 (file)
@@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
 }
 
 /**
- * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
  * @d:                 irq domain for which to allocate chips
- * @irqs_per_chip:     Number of interrupts each chip handles
+ * @irqs_per_chip:     Number of interrupts each chip handles (max 32)
  * @num_ct:            Number of irq_chip_type instances associated with each chip
  * @name:              Name of the irq chip
  * @handler:           Default flow handler associated with these chips
@@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
  * @set:               IRQ_* bits to set in the mapping function
  * @gcflags:           Generic chip specific setup flags
  */
-int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
-                                  int num_ct, const char *name,
-                                  irq_flow_handler_t handler,
-                                  unsigned int clr, unsigned int set,
-                                  enum irq_gc_flags gcflags)
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+                                    int num_ct, const char *name,
+                                    irq_flow_handler_t handler,
+                                    unsigned int clr, unsigned int set,
+                                    enum irq_gc_flags gcflags)
 {
        struct irq_domain_chip_generic *dgc;
        struct irq_chip_generic *gc;
@@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
        d->name = name;
        return 0;
 }
-EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
+
+static struct irq_chip_generic *
+__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+       struct irq_domain_chip_generic *dgc = d->gc;
+       int idx;
+
+       if (!dgc)
+               return ERR_PTR(-ENODEV);
+       idx = hw_irq / dgc->irqs_per_chip;
+       if (idx >= dgc->num_chips)
+               return ERR_PTR(-EINVAL);
+       return dgc->gc[idx];
+}
 
 /**
  * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
@@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
 struct irq_chip_generic *
 irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
 {
-       struct irq_domain_chip_generic *dgc = d->gc;
-       int idx;
+       struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq);
 
-       if (!dgc)
-               return NULL;
-       idx = hw_irq / dgc->irqs_per_chip;
-       if (idx >= dgc->num_chips)
-               return NULL;
-       return dgc->gc[idx];
+       return !IS_ERR(gc) ? gc : NULL;
 }
 EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
 
@@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
        unsigned long flags;
        int idx;
 
-       if (!d->gc)
-               return -ENODEV;
-
-       idx = hw_irq / dgc->irqs_per_chip;
-       if (idx >= dgc->num_chips)
-               return -EINVAL;
-       gc = dgc->gc[idx];
+       gc = __irq_get_domain_generic_chip(d, hw_irq);
+       if (IS_ERR(gc))
+               return PTR_ERR(gc);
 
        idx = hw_irq % dgc->irqs_per_chip;
 
@@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
        irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
        return 0;
 }
-EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+       struct irq_data *data = irq_domain_get_irq_data(d, virq);
+       struct irq_domain_chip_generic *dgc = d->gc;
+       unsigned int hw_irq = data->hwirq;
+       struct irq_chip_generic *gc;
+       int irq_idx;
+
+       gc = irq_get_domain_generic_chip(d, hw_irq);
+       if (!gc)
+               return;
+
+       irq_idx = hw_irq % dgc->irqs_per_chip;
+
+       clear_bit(irq_idx, &gc->installed);
+       irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+                           NULL);
+
+}
 
 struct irq_domain_ops irq_generic_chip_ops = {
        .map    = irq_map_generic_chip,
+       .unmap  = irq_unmap_generic_chip,
        .xlate  = irq_domain_xlate_onetwocell,
 };
 EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
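
The refactor above concentrates both failure modes of the chip lookup in __irq_get_domain_generic_chip() and reports them through the returned pointer: irq_map_generic_chip() forwards the exact errno with PTR_ERR(), while irq_get_domain_generic_chip() keeps its NULL-on-failure contract. For readers unfamiliar with that convention, a minimal standalone sketch of the ERR_PTR()/IS_ERR()/PTR_ERR() pattern follows; the simplified macros and the get_chip() helper are invented stand-ins for <linux/err.h> and the new helper, not code from this patch.

/* Standalone sketch of the ERR_PTR convention used by the new helper. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

struct chip { int id; };

static struct chip chips[2] = { { 0 }, { 1 } };

/* Invented analogue of the helper: return a chip or an errno in the pointer. */
static struct chip *get_chip(struct chip *table, int nr_chips, int idx)
{
	if (!table)
		return ERR_PTR(-ENODEV);	/* no chips registered at all */
	if (idx >= nr_chips)
		return ERR_PTR(-EINVAL);	/* index out of range */
	return &table[idx];
}

int main(void)
{
	struct chip *gc = get_chip(chips, 2, 5);

	if (IS_ERR(gc))
		printf("lookup failed: %ld\n", PTR_ERR(gc));	/* prints -22 (EINVAL) */
	else
		printf("got chip %d\n", gc->id);
	return 0;
}
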
index 4752b43662e0017c343e3f3fa6dd013c23072778..8c0a0ae43521c7f8b9e97964912cc3ab5e10fd15 100644 (file)
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
 
 /**
  * __irq_domain_add() - Allocate a new irq_domain data structure
- * @of_node: optional device-tree node of the interrupt controller
+ * @fwnode: firmware node for the interrupt controller
  * @size: Size of linear map; 0 for radix mapping only
  * @hwirq_max: Maximum number of interrupts supported by controller
  * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
@@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
                                    const struct irq_domain_ops *ops,
                                    void *host_data)
 {
+       struct device_node *of_node = to_of_node(fwnode);
        struct irq_domain *domain;
-       struct device_node *of_node;
-
-       of_node = to_of_node(fwnode);
 
        domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
                              GFP_KERNEL, of_node_to_nid(of_node));
@@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
        if (WARN_ON(intsize < 1))
                return -EINVAL;
        *out_hwirq = intspec[0];
-       *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+       if (intsize > 1)
+               *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+       else
+               *out_type = IRQ_TYPE_NONE;
        return 0;
 }
 EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
index 9530fcd27704008a9531c49d623910c707d681ce..0c5f1a5db654c7da7db4ec168ef44516407d6af5 100644 (file)
@@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
                return 0;
        }
 
-       flags &= IRQ_TYPE_SENSE_MASK;
-
        if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
                if (!irqd_irq_masked(&desc->irq_data))
                        mask_irq(desc);
@@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
                        unmask = 1;
        }
 
-       /* caller masked out all except trigger mode flags */
+       /* Mask all flags except trigger mode */
+       flags &= IRQ_TYPE_SENSE_MASK;
        ret = chip->irq_set_type(&desc->irq_data, flags);
 
        switch (ret) {
index bec0b647f9cc5291df0211c6532edd5ffe167eb2..ce182599cf2e98b51831adbf5dca6ce545df0d7f 100644 (file)
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *rwsem_key)
 {
-       brw->fast_read_ctr = alloc_percpu(int);
-       if (unlikely(!brw->fast_read_ctr))
+       sem->read_count = alloc_percpu(int);
+       if (unlikely(!sem->read_count))
                return -ENOMEM;
 
        /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-       __init_rwsem(&brw->rw_sem, name, rwsem_key);
-       rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
-       atomic_set(&brw->slow_read_ctr, 0);
-       init_waitqueue_head(&brw->write_waitq);
+       rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+       __init_rwsem(&sem->rw_sem, name, rwsem_key);
+       init_waitqueue_head(&sem->writer);
+       sem->readers_block = 0;
        return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
 
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 {
        /*
         * XXX: temporary kludge. The error path in alloc_super()
         * assumes that percpu_free_rwsem() is safe after kzalloc().
         */
-       if (!brw->fast_read_ctr)
+       if (!sem->read_count)
                return;
 
-       rcu_sync_dtor(&brw->rss);
-       free_percpu(brw->fast_read_ctr);
-       brw->fast_read_ctr = NULL; /* catch use after free bugs */
+       rcu_sync_dtor(&sem->rss);
+       free_percpu(sem->read_count);
+       sem->read_count = NULL; /* catch use after free bugs */
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 {
-       bool success;
+       /*
+        * Due to having preemption disabled the decrement happens on
+        * the same CPU as the increment, avoiding the
+        * increment-on-one-CPU-and-decrement-on-another problem.
+        *
+        * If the reader misses the writer's assignment of readers_block, then
+        * the writer is guaranteed to see the reader's increment.
+        *
+        * Conversely, any readers that increment their sem->read_count after
+        * the writer looks are guaranteed to see the readers_block value,
+        * which in turn means that they are guaranteed to immediately
+        * decrement their sem->read_count, so that it doesn't matter that the
+        * writer missed them.
+        */
 
-       preempt_disable();
-       success = rcu_sync_is_idle(&brw->rss);
-       if (likely(success))
-               __this_cpu_add(*brw->fast_read_ctr, val);
-       preempt_enable();
+       smp_mb(); /* A matches D */
 
-       return success;
-}
+       /*
+        * If !readers_block the critical section starts here, matched by the
+        * release in percpu_up_write().
+        */
+       if (likely(!smp_load_acquire(&sem->readers_block)))
+               return 1;
 
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
-       might_sleep();
-       rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+       /*
+        * Per the above comment; we still have preemption disabled and
+        * will thus decrement on the same CPU as we incremented.
+        */
+       __percpu_up_read(sem);
 
-       if (likely(update_fast_ctr(brw, +1)))
-               return;
+       if (try)
+               return 0;
 
-       /* Avoid rwsem_acquire_read() and rwsem_release() */
-       __down_read(&brw->rw_sem);
-       atomic_inc(&brw->slow_read_ctr);
-       __up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
+       /*
+        * We either call schedule() in the wait, or we'll fall through
+        * and reschedule on the preempt_enable() in percpu_down_read().
+        */
+       preempt_enable_no_resched();
 
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
-       if (unlikely(!update_fast_ctr(brw, +1))) {
-               if (!__down_read_trylock(&brw->rw_sem))
-                       return 0;
-               atomic_inc(&brw->slow_read_ctr);
-               __up_read(&brw->rw_sem);
-       }
-
-       rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+       /*
+        * Avoid lockdep for the down/up_read(); we already have the annotations.
+        */
+       __down_read(&sem->rw_sem);
+       this_cpu_inc(*sem->read_count);
+       __up_read(&sem->rw_sem);
+
+       preempt_disable();
        return 1;
 }
+EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
-       rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
-       if (likely(update_fast_ctr(brw, -1)))
-               return;
+       smp_mb(); /* B matches C */
+       /*
+        * In other words, if they see our decrement (presumably to aggregate
+        * zero, as that is the only time it matters) they will also see our
+        * critical section.
+        */
+       __this_cpu_dec(*sem->read_count);
 
-       /* false-positive is possible but harmless */
-       if (atomic_dec_and_test(&brw->slow_read_ctr))
-               wake_up_all(&brw->write_waitq);
+       /* Prod writer to recheck readers_active */
+       wake_up(&sem->writer);
 }
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var)                                               \
+({                                                                     \
+       typeof(var) __sum = 0;                                          \
+       int cpu;                                                        \
+       compiletime_assert_atomic_type(__sum);                          \
+       for_each_possible_cpu(cpu)                                      \
+               __sum += per_cpu(var, cpu);                             \
+       __sum;                                                          \
+})
 
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero.  If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-       unsigned int sum = 0;
-       int cpu;
+       if (per_cpu_sum(*sem->read_count) != 0)
+               return false;
+
+       /*
+        * If we observed the decrement, ensure we see the entire critical
+        * section.
+        */
 
-       for_each_possible_cpu(cpu) {
-               sum += per_cpu(*brw->fast_read_ctr, cpu);
-               per_cpu(*brw->fast_read_ctr, cpu) = 0;
-       }
+       smp_mb(); /* C matches B */
 
-       return sum;
+       return true;
 }
 
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+       /* Notify readers to take the slow path. */
+       rcu_sync_enter(&sem->rss);
+
+       down_write(&sem->rw_sem);
+
        /*
-        * Make rcu_sync_is_idle() == F and thus disable the fast-path in
-        * percpu_down_read() and percpu_up_read(), and wait for gp pass.
-        *
-        * The latter synchronises us with the preceding readers which used
-        * the fast-past, so we can not miss the result of __this_cpu_add()
-        * or anything else inside their criticial sections.
+        * Notify new readers to block; up until now, and thus throughout the
+        * longish rcu_sync_enter() above, new readers could still come in.
         */
-       rcu_sync_enter(&brw->rss);
+       WRITE_ONCE(sem->readers_block, 1);
 
-       /* exclude other writers, and block the new readers completely */
-       down_write(&brw->rw_sem);
+       smp_mb(); /* D matches A */
 
-       /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
-       atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+       /*
+        * If they don't see our write of readers_block, then we are
+        * guaranteed to see their sem->read_count increment, and therefore
+        * will wait for them.
+        */
 
-       /* wait for all readers to complete their percpu_up_read() */
-       wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+       /* Wait for all now active readers to complete. */
+       wait_event(sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
-       /* release the lock, but the readers can't use the fast-path */
-       up_write(&brw->rw_sem);
        /*
-        * Enable the fast-path in percpu_down_read() and percpu_up_read()
-        * but only after another gp pass; this adds the necessary barrier
-        * to ensure the reader can't miss the changes done by us.
+        * Signal the writer is done, no fast path yet.
+        *
+        * One reason that we cannot just immediately flip to readers_fast is
+        * that new readers might fail to see the results of this writer's
+        * critical section.
+        *
+        * Therefore we force it through the slow path which guarantees an
+        * acquire and thereby guarantees the critical section's consistency.
+        */
+       smp_store_release(&sem->readers_block, 0);
+
+       /*
+        * Release the write lock, this will allow readers back in the game.
+        */
+       up_write(&sem->rw_sem);
+
+       /*
+        * Once this completes (at least one RCU-sched grace period hence) the
+        * reader fast path will be available again. Safe to use outside the
+        * exclusive write lock because it is counting.
         */
-       rcu_sync_exit(&brw->rss);
+       rcu_sync_exit(&sem->rss);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);
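
A note on the writer-side exit condition in the hunks above: readers_active_check() only looks at the modular sum computed by per_cpu_sum(), because a reader that migrates between percpu_down_read() and percpu_up_read() leaves one CPU's read_count at +1 and another's at -1. The standalone sketch below models just that property; the array and names are invented for the illustration and are not the kernel's per-CPU machinery.

/* Standalone model of the "sum over all CPUs is zero" check. */
#include <stdio.h>

#define NR_CPUS 4

static int read_count[NR_CPUS];	/* stand-in for the per-CPU counter */

static int per_cpu_sum(void)
{
	int sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += read_count[cpu];
	return sum;
}

int main(void)
{
	read_count[0]++;		/* reader A enters on CPU0 */
	read_count[1]++;		/* reader B enters on CPU1 */
	read_count[2]--;		/* reader A exits after migrating to CPU2 */

	printf("active readers: %d\n", per_cpu_sum());	/* 1: reader B still inside */

	read_count[1]--;		/* reader B exits on CPU1 */
	printf("active readers: %d\n", per_cpu_sum());	/* 0: the writer may proceed */
	return 0;
}
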
index 8a99abf58080be21fbb954777b48aca24d4342b5..3acf16d79cf46f2c935ccc77307900c66d4d344f 100644 (file)
@@ -70,11 +70,14 @@ struct pv_node {
 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 {
        struct __qspinlock *l = (void *)lock;
-       int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-                  (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
 
-       qstat_inc(qstat_pv_lock_stealing, ret);
-       return ret;
+       if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+           (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+               qstat_inc(qstat_pv_lock_stealing, true);
+               return true;
+       }
+
+       return false;
 }
 
 /*
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 static inline bool
 pv_wait_early(struct pv_node *prev, int loop)
 {
-
        if ((loop & PV_PREV_CHECK_MASK) != 0)
                return false;
 
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
        struct pv_node *pn = (struct pv_node *)node;
        struct pv_node *pp = (struct pv_node *)prev;
-       int waitcnt = 0;
        int loop;
        bool wait_early;
 
-       /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
-       for (;; waitcnt++) {
+       for (;;) {
                for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
                        if (READ_ONCE(node->locked))
                                return;
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 
                if (!READ_ONCE(node->locked)) {
                        qstat_inc(qstat_pv_wait_node, true);
-                       qstat_inc(qstat_pv_wait_again, waitcnt);
                        qstat_inc(qstat_pv_wait_early, wait_early);
                        pv_wait(&pn->state, vcpu_halted);
                }
@@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
                pv_wait(&l->locked, _Q_SLOW_VAL);
 
                /*
-                * The unlocker should have freed the lock before kicking the
-                * CPU. So if the lock is still not free, it is a spurious
-                * wakeup or another vCPU has stolen the lock. The current
-                * vCPU should spin again.
+                * Because of lock stealing, the queue head vCPU may not be
+                * able to acquire the lock before it has to wait again.
                 */
-               qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
        }
 
        /*
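
pv_queued_spin_steal_lock() in the hunk above is a compare-and-swap try-lock: peek at the lock word and take the lock only if it is currently free, counting a steal only when the cmpxchg() succeeds. A standalone C11 sketch of that pattern, with an invented steal counter standing in for the qstat_pv_lock_stealing statistic:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int locked;		/* 0 = free, 1 = held */
static atomic_long steal_count;		/* illustrative statistic */

static bool try_steal_lock(void)
{
	int expected = 0;

	/* Succeeds only if the lock was observed free and we set it to held. */
	if (atomic_load_explicit(&locked, memory_order_relaxed) == 0 &&
	    atomic_compare_exchange_strong_explicit(&locked, &expected, 1,
						    memory_order_acquire,
						    memory_order_relaxed)) {
		atomic_fetch_add_explicit(&steal_count, 1, memory_order_relaxed);
		return true;
	}
	return false;
}

int main(void)
{
	printf("first attempt:  %s\n", try_steal_lock() ? "stole lock" : "failed");
	printf("second attempt: %s\n", try_steal_lock() ? "stole lock" : "failed");
	printf("steals: %ld\n", atomic_load(&steal_count));
	return 0;
}
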
index b9d0315162540d1236e5e1268f184531d8259114..eb0a599fcf58f3d5fa9a72f7dc429bdaa1bfe4a5 100644 (file)
@@ -24,8 +24,8 @@
  *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
  *   pv_lock_slowpath  - # of locking operations via the slowpath
  *   pv_lock_stealing  - # of lock stealing operations
- *   pv_spurious_wakeup        - # of spurious wakeups
- *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
+ *   pv_spurious_wakeup        - # of spurious wakeups in non-head vCPUs
+ *   pv_wait_again     - # of wait's after a queue head vCPU kick
  *   pv_wait_early     - # of early vCPU wait's
  *   pv_wait_head      - # of vCPU wait's at the queue head
  *   pv_wait_node      - # of vCPU wait's at a non-head queue node
index 447e08de1fabf11b3d2050c5735a0ea331362fc4..2337b4bb2366df1c2e21ada2ddb5d4b024aae890 100644 (file)
@@ -121,16 +121,19 @@ enum rwsem_wake_type {
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
  */
-static struct rw_semaphore *
-__rwsem_mark_wake(struct rw_semaphore *sem,
-                 enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
+static void __rwsem_mark_wake(struct rw_semaphore *sem,
+                             enum rwsem_wake_type wake_type,
+                             struct wake_q_head *wake_q)
 {
-       struct rwsem_waiter *waiter;
-       struct task_struct *tsk;
-       struct list_head *next;
-       long oldcount, woken, loop, adjustment;
+       struct rwsem_waiter *waiter, *tmp;
+       long oldcount, woken = 0, adjustment = 0;
+
+       /*
+        * Take a peek at the queue head waiter such that we can determine
+        * the wakeup(s) to perform.
+        */
+       waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
 
-       waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
        if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
                if (wake_type == RWSEM_WAKE_ANY) {
                        /*
@@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
                         */
                        wake_q_add(wake_q, waiter->task);
                }
-               goto out;
+
+               return;
        }
 
-       /* Writers might steal the lock before we grant it to the next reader.
+       /*
+        * Writers might steal the lock before we grant it to the next reader.
         * We prefer to do the first reader grant before counting readers
         * so we can bail out early if a writer stole the lock.
         */
-       adjustment = 0;
        if (wake_type != RWSEM_WAKE_READ_OWNED) {
                adjustment = RWSEM_ACTIVE_READ_BIAS;
  try_reader_grant:
                oldcount = atomic_long_fetch_add(adjustment, &sem->count);
-
                if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
                        /*
                         * If the count is still less than RWSEM_WAITING_BIAS
@@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
                         */
                        if (atomic_long_add_return(-adjustment, &sem->count) <
                            RWSEM_WAITING_BIAS)
-                               goto out;
+                               return;
+
                        /* Last active locker left. Retry waking readers. */
                        goto try_reader_grant;
                }
@@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
                rwsem_set_reader_owned(sem);
        }
 
-       /* Grant an infinite number of read locks to the readers at the front
-        * of the queue.  Note we increment the 'active part' of the count by
-        * the number of readers before waking any processes up.
+       /*
+        * Grant an infinite number of read locks to the readers at the front
+        * of the queue. We know that woken will be at least 1 as we accounted
+        * for above. Note we increment the 'active part' of the count by the
+        * number of readers before waking any processes up.
         */
-       woken = 0;
-       do {
-               woken++;
+       list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+               struct task_struct *tsk;
 
-               if (waiter->list.next == &sem->wait_list)
+               if (waiter->type == RWSEM_WAITING_FOR_WRITE)
                        break;
 
-               waiter = list_entry(waiter->list.next,
-                                       struct rwsem_waiter, list);
-
-       } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
-       adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
-       if (waiter->type != RWSEM_WAITING_FOR_WRITE)
-               /* hit end of list above */
-               adjustment -= RWSEM_WAITING_BIAS;
-
-       if (adjustment)
-               atomic_long_add(adjustment, &sem->count);
-
-       next = sem->wait_list.next;
-       loop = woken;
-       do {
-               waiter = list_entry(next, struct rwsem_waiter, list);
-               next = waiter->list.next;
+               woken++;
                tsk = waiter->task;
 
                wake_q_add(wake_q, tsk);
+               list_del(&waiter->list);
                /*
                 * Ensure that the last operation is setting the reader
                 * waiter to nil such that rwsem_down_read_failed() cannot
@@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
                 * to the task to wakeup.
                 */
                smp_store_release(&waiter->task, NULL);
-       } while (--loop);
+       }
 
-       sem->wait_list.next = next;
-       next->prev = &sem->wait_list;
+       adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+       if (list_empty(&sem->wait_list)) {
+               /* hit end of list above */
+               adjustment -= RWSEM_WAITING_BIAS;
+       }
 
- out:
-       return sem;
+       if (adjustment)
+               atomic_long_add(adjustment, &sem->count);
 }
 
 /*
@@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
        struct task_struct *tsk = current;
        WAKE_Q(wake_q);
 
-       /* set up my own style of waitqueue */
        waiter.task = tsk;
        waiter.type = RWSEM_WAITING_FOR_READ;
 
@@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
        /* we're now waiting on the lock, but no longer actively locking */
        count = atomic_long_add_return(adjustment, &sem->count);
 
-       /* If there are no active locks, wake the front queued process(es).
+       /*
+        * If there are no active locks, wake the front queued process(es).
         *
         * If there are no writers and we are first in the queue,
         * wake our own waiter to join the existing active readers !
@@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
        if (count == RWSEM_WAITING_BIAS ||
            (count > RWSEM_WAITING_BIAS &&
             adjustment != -RWSEM_ACTIVE_READ_BIAS))
-               sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+               __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
        raw_spin_unlock_irq(&sem->wait_lock);
        wake_up_q(&wake_q);
@@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
                if (count > RWSEM_WAITING_BIAS) {
                        WAKE_Q(wake_q);
 
-                       sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+                       __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
                        /*
                         * The wakeup is normally called _after_ the wait_lock
                         * is released, but given that we are proactively waking
@@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
 locked:
 
-       /* do nothing if list empty */
        if (!list_empty(&sem->wait_list))
-               sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+               __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
        raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
        wake_up_q(&wake_q);
@@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-       /* do nothing if list empty */
        if (!list_empty(&sem->wait_list))
-               sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+               __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 
        raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
        wake_up_q(&wake_q);
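
The reworked __rwsem_mark_wake() above walks the wait list exactly once with list_for_each_entry_safe(), queueing every reader at the front of the list on a wake_q and stopping at the first writer; the actual wake-ups then happen via wake_up_q() after wait_lock has been dropped. A simplified standalone model of that collect-then-wake shape (the array-based list and names are invented for the demo):

#include <stdio.h>

enum waiter_type { WAITER_READ, WAITER_WRITE };

struct waiter {
	int id;
	enum waiter_type type;
};

#define NR_WAITERS 4

int main(void)
{
	/* FIFO wait list: two readers at the front, then a writer, then a reader. */
	struct waiter wait_list[NR_WAITERS] = {
		{ 1, WAITER_READ }, { 2, WAITER_READ },
		{ 3, WAITER_WRITE }, { 4, WAITER_READ },
	};
	struct waiter *wake_q[NR_WAITERS];
	int woken = 0;

	/* --- would run with sem->wait_lock held --- */
	for (int i = 0; i < NR_WAITERS; i++) {
		if (wait_list[i].type == WAITER_WRITE)
			break;			/* grant reads only up to the first writer */
		wake_q[woken++] = &wait_list[i];
	}
	/* --- wait_lock dropped here; wake-ups run outside the critical section --- */

	for (int i = 0; i < woken; i++)
		printf("waking reader %d\n", wake_q[i]->id);	/* readers 1 and 2 */
	return 0;
}
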
index be922c9f3d37256fc060d5b9ba0aaf6a2d1f085c..50d1861f7759b40ff0248281e742a6a4fac1e852 100644 (file)
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
        RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
                         "suspicious rcu_sync_is_idle() usage");
 }
+
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
 #endif
 
 /**
@@ -82,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
        rsp->gp_type = type;
 }
 
+/**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+       rsp->gp_count++;
+       rsp->gp_state = GP_PASSED;
+}
+
 /**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
index d797502140b9a0f1ceb36f5ccbb4b9c66382a44e..fc9b4a4af463a61732e185cceda2afffc827fd6e 100644 (file)
@@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
                        __free_page(buf->page_array[i]);
                relay_free_page_array(buf->page_array);
        }
-       chan->buf[buf->cpu] = NULL;
+       *per_cpu_ptr(chan->buf, buf->cpu) = NULL;
        kfree(buf->padding);
        kfree(buf);
        kref_put(&chan->kref, relay_destroy_channel);
@@ -382,20 +382,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
  */
 void relay_reset(struct rchan *chan)
 {
+       struct rchan_buf *buf;
        unsigned int i;
 
        if (!chan)
                return;
 
-       if (chan->is_global && chan->buf[0]) {
-               __relay_reset(chan->buf[0], 0);
+       if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+               __relay_reset(buf, 0);
                return;
        }
 
        mutex_lock(&relay_channels_mutex);
        for_each_possible_cpu(i)
-               if (chan->buf[i])
-                       __relay_reset(chan->buf[i], 0);
+               if ((buf = *per_cpu_ptr(chan->buf, i)))
+                       __relay_reset(buf, 0);
        mutex_unlock(&relay_channels_mutex);
 }
 EXPORT_SYMBOL_GPL(relay_reset);
@@ -440,7 +441,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
        struct dentry *dentry;
 
        if (chan->is_global)
-               return chan->buf[0];
+               return *per_cpu_ptr(chan->buf, 0);
 
        buf = relay_create_buf(chan);
        if (!buf)
@@ -464,7 +465,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
        __relay_reset(buf, 1);
 
        if(chan->is_global) {
-               chan->buf[0] = buf;
+               *per_cpu_ptr(chan->buf, 0) = buf;
                buf->cpu = 0;
        }
 
@@ -512,46 +513,25 @@ static void setup_callbacks(struct rchan *chan,
        chan->cb = cb;
 }
 
-/**
- *     relay_hotcpu_callback - CPU hotplug callback
- *     @nb: notifier block
- *     @action: hotplug action to take
- *     @hcpu: CPU number
- *
- *     Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
- */
-static int relay_hotcpu_callback(struct notifier_block *nb,
-                               unsigned long action,
-                               void *hcpu)
+int relay_prepare_cpu(unsigned int cpu)
 {
-       unsigned int hotcpu = (unsigned long)hcpu;
        struct rchan *chan;
+       struct rchan_buf *buf;
 
-       switch(action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               mutex_lock(&relay_channels_mutex);
-               list_for_each_entry(chan, &relay_channels, list) {
-                       if (chan->buf[hotcpu])
-                               continue;
-                       chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
-                       if(!chan->buf[hotcpu]) {
-                               printk(KERN_ERR
-                                       "relay_hotcpu_callback: cpu %d buffer "
-                                       "creation failed\n", hotcpu);
-                               mutex_unlock(&relay_channels_mutex);
-                               return notifier_from_errno(-ENOMEM);
-                       }
+       mutex_lock(&relay_channels_mutex);
+       list_for_each_entry(chan, &relay_channels, list) {
+               if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+                       continue;
+               buf = relay_open_buf(chan, cpu);
+               if (!buf) {
+                       pr_err("relay: cpu %d buffer creation failed\n", cpu);
+                       mutex_unlock(&relay_channels_mutex);
+                       return -ENOMEM;
                }
-               mutex_unlock(&relay_channels_mutex);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /* No need to flush the cpu : will be flushed upon
-                * final relay_flush() call. */
-               break;
+               *per_cpu_ptr(chan->buf, cpu) = buf;
        }
-       return NOTIFY_OK;
+       mutex_unlock(&relay_channels_mutex);
+       return 0;
 }
 
 /**
@@ -583,6 +563,7 @@ struct rchan *relay_open(const char *base_filename,
 {
        unsigned int i;
        struct rchan *chan;
+       struct rchan_buf *buf;
 
        if (!(subbuf_size && n_subbufs))
                return NULL;
@@ -593,6 +574,7 @@ struct rchan *relay_open(const char *base_filename,
        if (!chan)
                return NULL;
 
+       chan->buf = alloc_percpu(struct rchan_buf *);
        chan->version = RELAYFS_CHANNEL_VERSION;
        chan->n_subbufs = n_subbufs;
        chan->subbuf_size = subbuf_size;
@@ -608,9 +590,10 @@ struct rchan *relay_open(const char *base_filename,
 
        mutex_lock(&relay_channels_mutex);
        for_each_online_cpu(i) {
-               chan->buf[i] = relay_open_buf(chan, i);
-               if (!chan->buf[i])
+               buf = relay_open_buf(chan, i);
+               if (!buf)
                        goto free_bufs;
+               *per_cpu_ptr(chan->buf, i) = buf;
        }
        list_add(&chan->list, &relay_channels);
        mutex_unlock(&relay_channels_mutex);
@@ -619,8 +602,8 @@ struct rchan *relay_open(const char *base_filename,
 
 free_bufs:
        for_each_possible_cpu(i) {
-               if (chan->buf[i])
-                       relay_close_buf(chan->buf[i]);
+               if ((buf = *per_cpu_ptr(chan->buf, i)))
+                       relay_close_buf(buf);
        }
 
        kref_put(&chan->kref, relay_destroy_channel);
@@ -666,6 +649,7 @@ int relay_late_setup_files(struct rchan *chan,
        unsigned int i, curr_cpu;
        unsigned long flags;
        struct dentry *dentry;
+       struct rchan_buf *buf;
        struct rchan_percpu_buf_dispatcher disp;
 
        if (!chan || !base_filename)
@@ -684,10 +668,11 @@ int relay_late_setup_files(struct rchan *chan,
 
        if (chan->is_global) {
                err = -EINVAL;
-               if (!WARN_ON_ONCE(!chan->buf[0])) {
-                       dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+               buf = *per_cpu_ptr(chan->buf, 0);
+               if (!WARN_ON_ONCE(!buf)) {
+                       dentry = relay_create_buf_file(chan, buf, 0);
                        if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
-                               relay_set_buf_dentry(chan->buf[0], dentry);
+                               relay_set_buf_dentry(buf, dentry);
                                err = 0;
                        }
                }
@@ -702,13 +687,14 @@ int relay_late_setup_files(struct rchan *chan,
         * on all currently online CPUs.
         */
        for_each_online_cpu(i) {
-               if (unlikely(!chan->buf[i])) {
+               buf = *per_cpu_ptr(chan->buf, i);
+               if (unlikely(!buf)) {
                        WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
                        err = -EINVAL;
                        break;
                }
 
-               dentry = relay_create_buf_file(chan, chan->buf[i], i);
+               dentry = relay_create_buf_file(chan, buf, i);
                if (unlikely(!dentry)) {
                        err = -EINVAL;
                        break;
@@ -716,10 +702,10 @@ int relay_late_setup_files(struct rchan *chan,
 
                if (curr_cpu == i) {
                        local_irq_save(flags);
-                       relay_set_buf_dentry(chan->buf[i], dentry);
+                       relay_set_buf_dentry(buf, dentry);
                        local_irq_restore(flags);
                } else {
-                       disp.buf = chan->buf[i];
+                       disp.buf = buf;
                        disp.dentry = dentry;
                        smp_mb();
                        /* relay_channels_mutex must be held, so wait. */
@@ -822,11 +808,10 @@ void relay_subbufs_consumed(struct rchan *chan,
        if (!chan)
                return;
 
-       if (cpu >= NR_CPUS || !chan->buf[cpu] ||
-                                       subbufs_consumed > chan->n_subbufs)
+       buf = *per_cpu_ptr(chan->buf, cpu);
+       if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs)
                return;
 
-       buf = chan->buf[cpu];
        if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
                buf->subbufs_consumed = buf->subbufs_produced;
        else
@@ -842,18 +827,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
  */
 void relay_close(struct rchan *chan)
 {
+       struct rchan_buf *buf;
        unsigned int i;
 
        if (!chan)
                return;
 
        mutex_lock(&relay_channels_mutex);
-       if (chan->is_global && chan->buf[0])
-               relay_close_buf(chan->buf[0]);
+       if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
+               relay_close_buf(buf);
        else
                for_each_possible_cpu(i)
-                       if (chan->buf[i])
-                               relay_close_buf(chan->buf[i]);
+                       if ((buf = *per_cpu_ptr(chan->buf, i)))
+                               relay_close_buf(buf);
 
        if (chan->last_toobig)
                printk(KERN_WARNING "relay: one or more items not logged "
@@ -874,20 +860,21 @@ EXPORT_SYMBOL_GPL(relay_close);
  */
 void relay_flush(struct rchan *chan)
 {
+       struct rchan_buf *buf;
        unsigned int i;
 
        if (!chan)
                return;
 
-       if (chan->is_global && chan->buf[0]) {
-               relay_switch_subbuf(chan->buf[0], 0);
+       if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+               relay_switch_subbuf(buf, 0);
                return;
        }
 
        mutex_lock(&relay_channels_mutex);
        for_each_possible_cpu(i)
-               if (chan->buf[i])
-                       relay_switch_subbuf(chan->buf[i], 0);
+               if ((buf = *per_cpu_ptr(chan->buf, i)))
+                       relay_switch_subbuf(buf, 0);
        mutex_unlock(&relay_channels_mutex);
 }
 EXPORT_SYMBOL_GPL(relay_flush);
@@ -1377,12 +1364,3 @@ const struct file_operations relay_file_operations = {
        .splice_read    = relay_file_splice_read,
 };
 EXPORT_SYMBOL_GPL(relay_file_operations);
-
-static __init int relay_init(void)
-{
-
-       hotcpu_notifier(relay_hotcpu_callback, 0);
-       return 0;
-}
-
-early_initcall(relay_init);
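
With the CPU-hotplug notifier gone, the new relay_prepare_cpu() callback is meant to be driven by the hotplug state machine instead, presumably registered elsewhere in the series. The kernel-style fragment below is only a sketch of how such a prepare-stage callback is typically wired up; the CPUHP_RELAY_PREPARE state name is assumed for illustration and its registration is not part of the hunks shown here.

#include <linux/cpuhotplug.h>

static int __init relay_hotplug_init(void)
{
	/*
	 * Prepare-stage callbacks run on the control CPU before the new CPU
	 * comes up. No teardown callback is needed: as the old notifier's
	 * CPU_DEAD comment notes, buffers are flushed on the final
	 * relay_flush() call.
	 */
	return cpuhp_setup_state(CPUHP_RELAY_PREPARE, "relay:prepare",
				 relay_prepare_cpu, NULL);
}
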
index 2a906f20fba7c4ba63d89fe87ac3527607be453a..a0086a5fc00893d63f1b0c42ac85f63508d07a2d 100644 (file)
@@ -1265,7 +1265,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                /*
                 * Task isn't running anymore; make it appear like we migrated
                 * it before it went to sleep. This means on wakeup we make the
-                * previous cpu our targer instead of where it really is.
+                * previous cpu our target instead of where it really is.
                 */
                p->wake_cpu = cpu;
        }
@@ -1629,23 +1629,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
-#ifdef CONFIG_SCHEDSTATS
-       struct rq *rq = this_rq();
+       struct rq *rq;
 
-#ifdef CONFIG_SMP
-       int this_cpu = smp_processor_id();
+       if (!schedstat_enabled())
+               return;
 
-       if (cpu == this_cpu) {
-               schedstat_inc(rq, ttwu_local);
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       rq = this_rq();
+
+#ifdef CONFIG_SMP
+       if (cpu == rq->cpu) {
+               schedstat_inc(rq->ttwu_local);
+               schedstat_inc(p->se.statistics.nr_wakeups_local);
        } else {
                struct sched_domain *sd;
 
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               schedstat_inc(p->se.statistics.nr_wakeups_remote);
                rcu_read_lock();
-               for_each_domain(this_cpu, sd) {
+               for_each_domain(rq->cpu, sd) {
                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
+                               schedstat_inc(sd->ttwu_wake_remote);
                                break;
                        }
                }
@@ -1653,17 +1655,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
        }
 
        if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 
-       schedstat_inc(rq, ttwu_count);
-       schedstat_inc(p, se.statistics.nr_wakeups);
+       schedstat_inc(rq->ttwu_count);
+       schedstat_inc(p->se.statistics.nr_wakeups);
 
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+               schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }
 
 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
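
The ttwu_stat() hunk above converts the old two-argument schedstat_inc(rq, field) calls into the field-expression form schedstat_inc(rq->ttwu_count) and hoists the schedstat_enabled() check to the top of the function, dropping it from the call sites. A standalone approximation of what such a single-argument wrapper looks like is sketched below; the plain boolean stands in for the real static key, and the struct is invented for the demo.

#include <stdio.h>
#include <stdbool.h>

static bool sched_schedstats = true;		/* stand-in for the static key */

#define schedstat_enabled()	(sched_schedstats)
#define schedstat_inc(var)	do { if (schedstat_enabled()) { (var)++; } } while (0)

struct rq_stats { unsigned long ttwu_count; };

int main(void)
{
	struct rq_stats rq = { 0 };

	schedstat_inc(rq.ttwu_count);		/* counted */
	sched_schedstats = false;
	schedstat_inc(rq.ttwu_count);		/* skipped: stats disabled */

	printf("ttwu_count = %lu\n", rq.ttwu_count);	/* prints 1 */
	return 0;
}
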
@@ -2016,6 +2015,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);
 
+       /*
+        * Ensure we load p->on_rq _after_ p->state, otherwise it would
+        * be possible to, falsely, observe p->on_rq == 0 and get stuck
+        * in smp_cond_load_acquire() below.
+        *
+        * sched_ttwu_pending()                 try_to_wake_up()
+        *   [S] p->on_rq = 1;                  [L] p->state
+        *       UNLOCK rq->lock  -----.
+        *                              \
+        *                               +---   RMB
+        * schedule()                   /
+        *       LOCK rq->lock    -----'
+        *       UNLOCK rq->lock
+        *
+        * [task p]
+        *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+        *
+        * Pairs with the UNLOCK+LOCK on rq->lock from the
+        * last wakeup of our task and the schedule that got our task
+        * current.
+        */
+       smp_rmb();
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto stat;
 
@@ -2062,8 +2083,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        ttwu_queue(p, cpu, wake_flags);
 stat:
-       if (schedstat_enabled())
-               ttwu_stat(p, cpu, wake_flags);
+       ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2073,6 +2093,7 @@ out:
 /**
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
  *
  * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2111,8 +2132,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0, cookie);
-       if (schedstat_enabled())
-               ttwu_stat(p, smp_processor_id(), 0);
+       ttwu_stat(p, smp_processor_id(), 0);
 out:
        raw_spin_unlock(&p->pi_lock);
 }
@@ -3170,6 +3190,9 @@ static inline void preempt_latency_stop(int val) { }
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
+       /* Save this before calling printk(), since that will clobber it */
+       unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
        if (oops_in_progress)
                return;
 
@@ -3180,13 +3203,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (in_atomic_preempt_off()) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && in_atomic_preempt_off()) {
                pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                pr_cont("\n");
        }
-#endif
        if (panic_on_warn)
                panic("scheduling while atomic\n");
 
@@ -3212,7 +3234,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-       schedstat_inc(this_rq(), sched_count);
+       schedstat_inc(this_rq()->sched_count);
 }
 
 /*
@@ -3381,7 +3403,6 @@ static void __sched notrace __schedule(bool preempt)
 
        balance_callback(rq);
 }
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
@@ -4824,7 +4845,7 @@ SYSCALL_DEFINE0(sched_yield)
 {
        struct rq *rq = this_rq_lock();
 
-       schedstat_inc(rq, yld_count);
+       schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
 
        /*
@@ -4975,7 +4996,7 @@ again:
 
        yielded = curr->sched_class->yield_to_task(rq, p, preempt);
        if (yielded) {
-               schedstat_inc(rq, yld_count);
+               schedstat_inc(rq->yld_count);
                /*
                 * Make p's CPU reschedule; pick_next_entity takes care of
                 * fairness.
@@ -5713,6 +5734,7 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
                         SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                         SD_SHARE_PKG_RESOURCES |
                         SD_SHARE_POWERDOMAIN)) {
                if (sd->groups != sd->groups->next)
@@ -5743,6 +5765,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_NEWIDLE |
                                SD_BALANCE_FORK |
                                SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                SD_SHARE_CPUCAPACITY |
                                SD_SHARE_PKG_RESOURCES |
                                SD_PREFER_SIBLING |
@@ -6352,23 +6375,32 @@ static int sched_domains_curr_level;
 /*
  * SD_flags allowed in topology descriptions.
  *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
  *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out, which besides describing the topology also prescribes the
+ * desired behaviour that goes along with it:
+ *
+ *   SD_ASYM_PACKING        - describes SMT quirks
  */
 #define TOPOLOGY_SD_FLAGS              \
        (SD_SHARE_CPUCAPACITY |         \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA |                      \
         SD_ASYM_PACKING |              \
+        SD_ASYM_CPUCAPACITY |          \
         SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       struct sched_domain *child, int cpu)
 {
        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
        int sd_weight, sd_flags = 0;
@@ -6420,6 +6452,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                .smt_gain               = 0,
                .max_newidle_lb_cost    = 0,
                .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
 #ifdef CONFIG_SCHED_DEBUG
                .name                   = tl->name,
 #endif
@@ -6429,6 +6462,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
         * Convert topological properties into behaviour.
         */
 
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
        if (sd->flags & SD_SHARE_CPUCAPACITY) {
                sd->flags |= SD_PREFER_SIBLING;
                sd->imbalance_pct = 110;
@@ -6844,16 +6884,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                struct sched_domain *child, int cpu)
 {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, child, cpu);
 
        cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
        if (child) {
                sd->level = child->level + 1;
                sched_domain_level_max = max(sched_domain_level_max, sd->level);
                child->parent = sd;
-               sd->child = child;
 
                if (!cpumask_subset(sched_domain_span(child),
                                    sched_domain_span(sd))) {
@@ -6884,6 +6921,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
        enum s_alloc alloc_state;
        struct sched_domain *sd;
        struct s_data d;
+       struct rq *rq = NULL;
        int i, ret = -ENOMEM;
 
        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6934,11 +6972,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
        /* Attach the domains */
        rcu_read_lock();
        for_each_cpu(i, cpu_map) {
+               rq = cpu_rq(i);
                sd = *per_cpu_ptr(d.sd, i);
+
+               /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+               if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+                       WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
                cpu_attach_domain(sd, d.rd, i);
        }
        rcu_read_unlock();
 
+       if (rq) {
+               pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+                       cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+       }
+
        ret = 0;
 error:
        __free_domain_allocs(&d, alloc_state, cpu_map);
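
The domain-attach loop above updates rd->max_cpu_capacity locklessly and therefore wraps the accesses in READ_ONCE()/WRITE_ONCE() so the compiler must emit single, full-width loads and stores. The sketch below approximates those helpers with a simplified volatile cast, purely for illustration; the real definitions live in <linux/compiler.h> and handle more cases.

#include <stdio.h>

#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static unsigned long max_cpu_capacity;

static void update_max(unsigned long cap)
{
	/*
	 * Lockless max update: the ONCE accessors keep the compiler from
	 * splitting, fusing, or re-reading the access behind our back.
	 */
	if (cap > READ_ONCE(max_cpu_capacity))
		WRITE_ONCE(max_cpu_capacity, cap);
}

int main(void)
{
	update_max(1024);
	update_max(512);	/* a smaller value does not overwrite the max */
	printf("max_cpu_capacity = %lu\n", READ_ONCE(max_cpu_capacity));
	return 0;
}
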
@@ -7501,10 +7550,6 @@ void __init sched_init(void)
 
        set_load_weight(&init_task);
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
        /*
         * The boot idle thread does lazy MMU switching as well:
         */
@@ -7570,6 +7615,7 @@ EXPORT_SYMBOL(__might_sleep);
 void ___might_sleep(const char *file, int line, int preempt_offset)
 {
        static unsigned long prev_jiffy;        /* ratelimiting */
+       unsigned long preempt_disable_ip;
 
        rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7580,6 +7626,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                return;
        prev_jiffy = jiffies;
 
+       /* Save this before calling printk(), since that will clobber it */
+       preempt_disable_ip = get_preempt_disable_ip(current);
+
        printk(KERN_ERR
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
@@ -7594,14 +7643,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (!preempt_count_equals(preempt_offset)) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && !preempt_count_equals(preempt_offset)) {
                pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                pr_cont("\n");
        }
-#endif
        dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL(___might_sleep);
 #endif
@@ -7622,12 +7671,10 @@ void normalize_rt_tasks(void)
                if (p->flags & PF_KTHREAD)
                        continue;
 
-               p->se.exec_start                = 0;
-#ifdef CONFIG_SCHEDSTATS
-               p->se.statistics.wait_start     = 0;
-               p->se.statistics.sleep_start    = 0;
-               p->se.statistics.block_start    = 0;
-#endif
+               p->se.exec_start = 0;
+               schedstat_set(p->se.statistics.wait_start,  0);
+               schedstat_set(p->se.statistics.sleep_start, 0);
+               schedstat_set(p->se.statistics.block_start, 0);
 
                if (!dl_task(p) && !rt_task(p)) {
                        /*
index d4184498c9f5e3c8674015f97fe04da2417dafbd..e73119013c5318296256783c7998f1d4b628bb59 100644 (file)
@@ -31,56 +31,81 @@ static inline int right_child(int i)
        return (i << 1) + 2;
 }
 
-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
 {
-       int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+       int l, r, largest;
 
-       swap(cp->elements[a].cpu, cp->elements[b].cpu);
-       swap(cp->elements[a].dl , cp->elements[b].dl );
+       int orig_cpu = cp->elements[idx].cpu;
+       u64 orig_dl = cp->elements[idx].dl;
 
-       swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
-}
-
-static void cpudl_heapify(struct cpudl *cp, int idx)
-{
-       int l, r, largest;
+       if (left_child(idx) >= cp->size)
+               return;
 
        /* adapted from lib/prio_heap.c */
        while(1) {
+               u64 largest_dl;
                l = left_child(idx);
                r = right_child(idx);
                largest = idx;
+               largest_dl = orig_dl;
 
-               if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
-                                                       cp->elements[l].dl))
+               if ((l < cp->size) && dl_time_before(orig_dl,
+                                               cp->elements[l].dl)) {
                        largest = l;
-               if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
-                                                       cp->elements[r].dl))
+                       largest_dl = cp->elements[l].dl;
+               }
+               if ((r < cp->size) && dl_time_before(largest_dl,
+                                               cp->elements[r].dl))
                        largest = r;
+
                if (largest == idx)
                        break;
 
-               /* Push idx down the heap one level and bump one up */
-               cpudl_exchange(cp, largest, idx);
+               /* pull largest child onto idx */
+               cp->elements[idx].cpu = cp->elements[largest].cpu;
+               cp->elements[idx].dl = cp->elements[largest].dl;
+               cp->elements[cp->elements[idx].cpu].idx = idx;
                idx = largest;
        }
+       /* actual push down of saved original values orig_* */
+       cp->elements[idx].cpu = orig_cpu;
+       cp->elements[idx].dl = orig_dl;
+       cp->elements[cp->elements[idx].cpu].idx = idx;
 }
 
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
 {
-       WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+       int p;
 
-       if (dl_time_before(new_dl, cp->elements[idx].dl)) {
-               cp->elements[idx].dl = new_dl;
-               cpudl_heapify(cp, idx);
-       } else {
-               cp->elements[idx].dl = new_dl;
-               while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
-                                       cp->elements[idx].dl)) {
-                       cpudl_exchange(cp, idx, parent(idx));
-                       idx = parent(idx);
-               }
-       }
+       int orig_cpu = cp->elements[idx].cpu;
+       u64 orig_dl = cp->elements[idx].dl;
+
+       if (idx == 0)
+               return;
+
+       do {
+               p = parent(idx);
+               if (dl_time_before(orig_dl, cp->elements[p].dl))
+                       break;
+               /* pull parent onto idx */
+               cp->elements[idx].cpu = cp->elements[p].cpu;
+               cp->elements[idx].dl = cp->elements[p].dl;
+               cp->elements[cp->elements[idx].cpu].idx = idx;
+               idx = p;
+       } while (idx != 0);
+       /* actual push up of saved original values orig_* */
+       cp->elements[idx].cpu = orig_cpu;
+       cp->elements[idx].dl = orig_dl;
+       cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+       if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+                               cp->elements[idx].dl))
+               cpudl_heapify_up(cp, idx);
+       else
+               cpudl_heapify_down(cp, idx);
 }
 
 static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
 }
 
 /*
- * cpudl_set - update the cpudl max-heap
+ * cpudl_clear - remove a cpu from the cpudl max-heap
  * @cp: the cpudl max-heap context
  * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
  *
  * Notes: assumes cpu_rq(cpu)->lock is locked
  *
  * Returns: (void)
  */
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_clear(struct cpudl *cp, int cpu)
 {
        int old_idx, new_cpu;
        unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
        WARN_ON(!cpu_present(cpu));
 
        raw_spin_lock_irqsave(&cp->lock, flags);
+
        old_idx = cp->elements[cpu].idx;
-       if (!is_valid) {
-               /* remove item */
-               if (old_idx == IDX_INVALID) {
-                       /*
-                        * Nothing to remove if old_idx was invalid.
-                        * This could happen if a rq_offline_dl is
-                        * called for a CPU without -dl tasks running.
-                        */
-                       goto out;
-               }
+       if (old_idx == IDX_INVALID) {
+               /*
+                * Nothing to remove if old_idx was invalid.
+                * This could happen if a rq_offline_dl is
+                * called for a CPU without -dl tasks running.
+                */
+       } else {
                new_cpu = cp->elements[cp->size - 1].cpu;
                cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
                cp->elements[old_idx].cpu = new_cpu;
                cp->size--;
                cp->elements[new_cpu].idx = old_idx;
                cp->elements[cpu].idx = IDX_INVALID;
-               while (old_idx > 0 && dl_time_before(
-                               cp->elements[parent(old_idx)].dl,
-                               cp->elements[old_idx].dl)) {
-                       cpudl_exchange(cp, old_idx, parent(old_idx));
-                       old_idx = parent(old_idx);
-               }
-               cpumask_set_cpu(cpu, cp->free_cpus);
-                cpudl_heapify(cp, old_idx);
+               cpudl_heapify(cp, old_idx);
 
-               goto out;
+               cpumask_set_cpu(cpu, cp->free_cpus);
        }
+       raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
+{
+       int old_idx;
+       unsigned long flags;
 
+       WARN_ON(!cpu_present(cpu));
+
+       raw_spin_lock_irqsave(&cp->lock, flags);
+
+       old_idx = cp->elements[cpu].idx;
        if (old_idx == IDX_INVALID) {
-               cp->size++;
-               cp->elements[cp->size - 1].dl = dl;
-               cp->elements[cp->size - 1].cpu = cpu;
-               cp->elements[cpu].idx = cp->size - 1;
-               cpudl_change_key(cp, cp->size - 1, dl);
+               int new_idx = cp->size++;
+               cp->elements[new_idx].dl = dl;
+               cp->elements[new_idx].cpu = cpu;
+               cp->elements[cpu].idx = new_idx;
+               cpudl_heapify_up(cp, new_idx);
                cpumask_clear_cpu(cpu, cp->free_cpus);
        } else {
-               cpudl_change_key(cp, old_idx, dl);
+               cp->elements[old_idx].dl = dl;
+               cpudl_heapify(cp, old_idx);
        }
 
-out:
        raw_spin_unlock_irqrestore(&cp->lock, flags);
 }
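
The rework above replaces the old swap-based sift (cpudl_exchange() did three swaps per level) with a "hole" style sift: the moving element is read once, the larger child (or the parent, in the up variant) is copied into the hole, and the saved element is written only at its final position, so each level costs one copy instead of three swaps. A minimal standalone sketch of the same sift-down idea on a plain array max-heap of ints; names and layout are illustrative, not the kernel's:

static void sift_down(int *heap, int size, int idx)
{
        int saved = heap[idx];                  /* the element being pushed down */

        for (;;) {
                int l = 2 * idx + 1;            /* left child, 0-based heap */
                int r = l + 1;
                int largest = idx;
                int largest_val = saved;

                if (l < size && heap[l] > largest_val) {
                        largest = l;
                        largest_val = heap[l];
                }
                if (r < size && heap[r] > largest_val)
                        largest = r;
                if (largest == idx)
                        break;

                heap[idx] = heap[largest];      /* pull the larger child into the hole */
                idx = largest;
        }
        heap[idx] = saved;                      /* single write at the final slot */
}

The kernel version additionally refreshes the cpu-to-heap-index map (cp->elements[cpu].idx) at every step, which the sketch omits.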
 
index fcbdf83fed7e31afc4ee916f94d272d03117a2c2..f7da8c55bba03261002c1b8d000233c8dd190a55 100644 (file)
@@ -23,7 +23,8 @@ struct cpudl {
 #ifdef CONFIG_SMP
 int cpudl_find(struct cpudl *cp, struct task_struct *p,
               struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
 int cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
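
With the split interface above, removal is no longer encoded in an is_valid flag; the deadline.c hunks further down convert the call sites to the two explicit entry points, roughly:

        /* old: the last two arguments selected update vs. removal */
        cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
        cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);

        /* new: the intent is spelled out */
        cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
        cpudl_clear(&rq->rd->cpudl, rq->cpu);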
index a846cf89eb96182950db834207322c92722d0d2e..b93c72d5f64fc686df20d78889f11c6a01bc920a 100644 (file)
@@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max)
        return accounted;
 }
 
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       u64 ns;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(t, &rf);
+       ns = t->se.sum_exec_runtime;
+       task_rq_unlock(rq, t, &rf);
+
+       return ns;
+}
+#endif
+
 /*
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        unsigned int seq, nextseq;
        unsigned long flags;
 
+       /*
+        * Update current task runtime to account pending time since last
+        * scheduler action or thread_group_cputime() call. This thread group
+        * might have other running tasks on different CPUs, but updating
+        * their runtime can affect syscall performance, so we skip accounting
+        * those pending times and rely only on values updated on tick or
+        * other scheduler action.
+        */
+       if (same_thread_group(current, tsk))
+               (void) task_sched_runtime(current);
+
        rcu_read_lock();
        /* Attempt a lockless read on the first round. */
        nextseq = 0;
@@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
                        task_cputime(t, &utime, &stime);
                        times->utime += utime;
                        times->stime += stime;
-                       times->sum_exec_runtime += task_sched_runtime(t);
+                       times->sum_exec_runtime += read_sum_exec_runtime(t);
                }
                /* If lockless access failed, take the lock. */
                nextseq = 1;
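
read_sum_exec_runtime() above takes the runqueue lock only on 32-bit because loading a u64 there takes two 32-bit accesses, so an unlocked reader racing with the owning CPU can combine the new low word with the old high word (or vice versa); on 64-bit a single aligned load suffices. A small userspace illustration of that tearing hazard, with the split layout written out explicitly (little-endian assumed, purely illustrative):

#include <stdint.h>

union split_u64 {
        uint64_t full;
        struct { uint32_t lo, hi; } half;       /* little-endian layout assumed */
};

/* What a 32-bit CPU effectively does: two independent word loads.  If the
 * writer's "full += delta" carries into the high word between the two
 * loads, the reconstructed value is wrong by ~2^32 ns (about 4 seconds). */
static uint64_t unlocked_read(const union split_u64 *c)
{
        uint32_t lo = c->half.lo;
        uint32_t hi = c->half.hi;

        return ((uint64_t)hi << 32) | lo;
}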
index 9747796569997a5b195144a1e1456f934fce25b9..37e2449186c4f5effb60de3fdc0b85c78732c79a 100644 (file)
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
        struct rq *later_rq = NULL;
-       bool fallback = false;
 
        later_rq = find_lock_later_rq(p, rq);
-
        if (!later_rq) {
                int cpu;
 
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
                 * If we cannot preempt any rq, fall back to pick any
                 * online cpu.
                 */
-               fallback = true;
                cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
                if (cpu >= nr_cpu_ids) {
                        /*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
                double_lock_balance(rq, later_rq);
        }
 
-       /*
-        * By now the task is replenished and enqueued; migrate it.
-        */
-       deactivate_task(rq, p, 0);
        set_task_cpu(p, later_rq->cpu);
-       activate_task(later_rq, p, 0);
-
-       if (!fallback)
-               resched_curr(later_rq);
-
        double_unlock_balance(later_rq, rq);
 
        return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
  * one, and to (try to!) reconcile itself with its own scheduling
  * parameters.
  */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-                                      struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
 {
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
 
+       WARN_ON(dl_se->dl_boosted);
        WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
 
        /*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
         * future; in fact, we must consider execution overheads (time
         * spent on hardirq context, etc.).
         */
-       dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-       dl_se->runtime = pi_se->dl_runtime;
+       dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+       dl_se->runtime = dl_se->dl_runtime;
 }
 
 /*
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                goto unlock;
        }
 
-       enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-       if (dl_task(rq->curr))
-               check_preempt_curr_dl(rq, p, 0);
-       else
-               resched_curr(rq);
-
 #ifdef CONFIG_SMP
-       /*
-        * Perform balancing operations here; after the replenishments.  We
-        * cannot drop rq->lock before this, otherwise the assertion in
-        * start_dl_timer() about not missing updates is not true.
-        *
-        * If we find that the rq the task was on is no longer available, we
-        * need to select a new rq.
-        *
-        * XXX figure out if select_task_rq_dl() deals with offline cpus.
-        */
        if (unlikely(!rq->online)) {
+               /*
+                * If the runqueue is no longer available, migrate the
+                * task elsewhere. This necessarily changes rq.
+                */
                lockdep_unpin_lock(&rq->lock, rf.cookie);
                rq = dl_task_offline_migration(rq, p);
                rf.cookie = lockdep_pin_lock(&rq->lock);
+
+               /*
+                * Now that the task has been migrated to the new RQ and we
+                * have that locked, proceed as normal and enqueue the task
+                * there.
+                */
        }
+#endif
+
+       enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+       if (dl_task(rq->curr))
+               check_preempt_curr_dl(rq, p, 0);
+       else
+               resched_curr(rq);
 
+#ifdef CONFIG_SMP
        /*
         * Queueing this task back might have overloaded rq, check if we need
         * to kick someone away.
@@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
        if (dl_rq->earliest_dl.curr == 0 ||
            dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
                dl_rq->earliest_dl.curr = deadline;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
        }
 }
 
@@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
        if (!dl_rq->dl_nr_running) {
                dl_rq->earliest_dl.curr = 0;
                dl_rq->earliest_dl.next = 0;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+               cpudl_clear(&rq->rd->cpudl, rq->cpu);
        } else {
                struct rb_node *leftmost = dl_rq->rb_leftmost;
                struct sched_dl_entity *entry;
 
                entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
                dl_rq->earliest_dl.curr = entry->deadline;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
        }
 }
 
@@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
 
        cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
        if (rq->dl.dl_nr_running > 0)
-               cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
 }
 
 /* Assumes rq->lock is held */
@@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
        if (rq->dl.overloaded)
                dl_clear_overload(rq);
 
-       cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+       cpudl_clear(&rq->rd->cpudl, rq->cpu);
        cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
@@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
+
+       /* If p is not queued we will update its parameters at next wakeup. */
+       if (!task_on_rq_queued(p))
+               return;
+
+       /*
+        * If p is boosted we already updated its params in
+        * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+        * p's deadline being now already after rq_clock(rq).
+        */
        if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-               setup_new_dl_entity(&p->dl, &p->dl);
+               setup_new_dl_entity(&p->dl);
 
-       if (task_on_rq_queued(p) && rq->curr != p) {
+       if (rq->curr != p) {
 #ifdef CONFIG_SMP
                if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
                        queue_push_tasks(rq);
index 2a0a9995256d9e920d3c94cf944cd6acb61fa627..13935886a4711b2efd576f8890e1564f54991653 100644 (file)
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 
 #define P(F) \
        SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+       SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
 #define PN(F) \
        SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+       SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
 
        if (!se)
                return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        PN(se->exec_start);
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
        if (schedstat_enabled()) {
-               PN(se->statistics.wait_start);
-               PN(se->statistics.sleep_start);
-               PN(se->statistics.block_start);
-               PN(se->statistics.sleep_max);
-               PN(se->statistics.block_max);
-               PN(se->statistics.exec_max);
-               PN(se->statistics.slice_max);
-               PN(se->statistics.wait_max);
-               PN(se->statistics.wait_sum);
-               P(se->statistics.wait_count);
+               PN_SCHEDSTAT(se->statistics.wait_start);
+               PN_SCHEDSTAT(se->statistics.sleep_start);
+               PN_SCHEDSTAT(se->statistics.block_start);
+               PN_SCHEDSTAT(se->statistics.sleep_max);
+               PN_SCHEDSTAT(se->statistics.block_max);
+               PN_SCHEDSTAT(se->statistics.exec_max);
+               PN_SCHEDSTAT(se->statistics.slice_max);
+               PN_SCHEDSTAT(se->statistics.wait_max);
+               PN_SCHEDSTAT(se->statistics.wait_sum);
+               P_SCHEDSTAT(se->statistics.wait_count);
        }
-#endif
        P(se->load.weight);
 #ifdef CONFIG_SMP
        P(se->avg.load_avg);
        P(se->avg.util_avg);
 #endif
+
+#undef PN_SCHEDSTAT
 #undef PN
+#undef P_SCHEDSTAT
 #undef P
 }
 #endif
@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                p->prio);
 
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
+               SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
                SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+               SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
 
 #ifdef CONFIG_NUMA_BALANCING
        SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +631,7 @@ do {                                                                        \
 #undef P64
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
        if (schedstat_enabled()) {
                P(yld_count);
                P(sched_count);
@@ -636,9 +639,8 @@ do {                                                                        \
                P(ttwu_count);
                P(ttwu_local);
        }
-
 #undef P
-#endif
+
        spin_lock_irqsave(&sched_debug_lock, flags);
        print_cfs_stats(m, cpu);
        print_rt_stats(m, cpu);
@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
 #define P(F) \
        SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+       SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
 #define __PN(F) \
        SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
        SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+       SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
 
        PN(se.exec_start);
        PN(se.vruntime);
@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
        P(se.nr_migrations);
 
-#ifdef CONFIG_SCHEDSTATS
        if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
-               PN(se.statistics.sum_sleep_runtime);
-               PN(se.statistics.wait_start);
-               PN(se.statistics.sleep_start);
-               PN(se.statistics.block_start);
-               PN(se.statistics.sleep_max);
-               PN(se.statistics.block_max);
-               PN(se.statistics.exec_max);
-               PN(se.statistics.slice_max);
-               PN(se.statistics.wait_max);
-               PN(se.statistics.wait_sum);
-               P(se.statistics.wait_count);
-               PN(se.statistics.iowait_sum);
-               P(se.statistics.iowait_count);
-               P(se.statistics.nr_migrations_cold);
-               P(se.statistics.nr_failed_migrations_affine);
-               P(se.statistics.nr_failed_migrations_running);
-               P(se.statistics.nr_failed_migrations_hot);
-               P(se.statistics.nr_forced_migrations);
-               P(se.statistics.nr_wakeups);
-               P(se.statistics.nr_wakeups_sync);
-               P(se.statistics.nr_wakeups_migrate);
-               P(se.statistics.nr_wakeups_local);
-               P(se.statistics.nr_wakeups_remote);
-               P(se.statistics.nr_wakeups_affine);
-               P(se.statistics.nr_wakeups_affine_attempts);
-               P(se.statistics.nr_wakeups_passive);
-               P(se.statistics.nr_wakeups_idle);
+               PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+               PN_SCHEDSTAT(se.statistics.wait_start);
+               PN_SCHEDSTAT(se.statistics.sleep_start);
+               PN_SCHEDSTAT(se.statistics.block_start);
+               PN_SCHEDSTAT(se.statistics.sleep_max);
+               PN_SCHEDSTAT(se.statistics.block_max);
+               PN_SCHEDSTAT(se.statistics.exec_max);
+               PN_SCHEDSTAT(se.statistics.slice_max);
+               PN_SCHEDSTAT(se.statistics.wait_max);
+               PN_SCHEDSTAT(se.statistics.wait_sum);
+               P_SCHEDSTAT(se.statistics.wait_count);
+               PN_SCHEDSTAT(se.statistics.iowait_sum);
+               P_SCHEDSTAT(se.statistics.iowait_count);
+               P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+               P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+               P_SCHEDSTAT(se.statistics.nr_wakeups);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
 
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
                __PN(avg_atom);
                __PN(avg_per_cpu);
        }
-#endif
+
        __P(nr_switches);
        SEQ_printf(m, "%-45s:%21Ld\n",
                   "nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 #endif
        P(policy);
        P(prio);
+#undef PN_SCHEDSTAT
 #undef PN
 #undef __PN
+#undef P_SCHEDSTAT
 #undef P
 #undef __P
 
index e6a9dee0d34f9fb9a61f3e33c80e057a25ef1bcc..07aaa7f085974663e7716757e80b520c925d52f7 100644 (file)
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * 1024 < capacity * margin
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
        lw->weight += inc;
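
capacity_margin above is a fixed-point factor against SCHED_CAPACITY_SCALE (1024): 1280/1024 = 1.25, so a CPU is treated as having enough headroom only while the task's utilization stays below roughly 80% of that CPU's capacity. wake_cap(), added further down in this file, applies the check to decide whether an affine wakeup is worth attempting at all. A standalone restatement of the comparison with concrete numbers; the helper name is made up:

/* Non-zero means the margin is exhausted: the task uses more than ~80% of
 * the CPU's capacity, so wake_cap() would veto WAKE_AFFINE. */
static int margin_exceeded(unsigned long task_util, unsigned long capacity)
{
        return capacity * 1024 < task_util * 1280;
}

/*
 * capacity = 1024 (a full-size core):
 *   task_util = 819 -> 819 * 1280 = 1048320 <= 1048576, affine wakeup still OK
 *   task_util = 820 -> 820 * 1280 = 1049600 >  1048576, margin exceeded
 * capacity = 430 (a little core): the break-even point drops to ~344.
 */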
@@ -656,7 +662,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -726,7 +732,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
        struct sched_avg *sa = &se->avg;
        long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
        u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
 
        if (cap > 0) {
                if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +764,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
                }
        }
 
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
        attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
 }
 
 #else /* !CONFIG_SMP */
@@ -799,7 +803,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                      max(delta_exec, curr->statistics.exec_max));
 
        curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
+       schedstat_add(cfs_rq->exec_clock, delta_exec);
 
        curr->vruntime += calc_delta_fair(delta_exec, curr);
        update_min_vruntime(cfs_rq);
@@ -820,26 +824,34 @@ static void update_curr_fair(struct rq *rq)
        update_curr(cfs_rq_of(&rq->curr->se));
 }
 
-#ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       u64 wait_start = rq_clock(rq_of(cfs_rq));
+       u64 wait_start, prev_wait_start;
+
+       if (!schedstat_enabled())
+               return;
+
+       wait_start = rq_clock(rq_of(cfs_rq));
+       prev_wait_start = schedstat_val(se->statistics.wait_start);
 
        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
-           likely(wait_start > se->statistics.wait_start))
-               wait_start -= se->statistics.wait_start;
+           likely(wait_start > prev_wait_start))
+               wait_start -= prev_wait_start;
 
-       se->statistics.wait_start = wait_start;
+       schedstat_set(se->statistics.wait_start, wait_start);
 }
 
-static void
+static inline void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct task_struct *p;
        u64 delta;
 
-       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       if (!schedstat_enabled())
+               return;
+
+       delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 
        if (entity_is_task(se)) {
                p = task_of(se);
@@ -849,35 +861,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         * time stamp can be adjusted to accumulate wait time
                         * prior to migration.
                         */
-                       se->statistics.wait_start = delta;
+                       schedstat_set(se->statistics.wait_start, delta);
                        return;
                }
                trace_sched_stat_wait(p, delta);
        }
 
-       se->statistics.wait_max = max(se->statistics.wait_max, delta);
-       se->statistics.wait_count++;
-       se->statistics.wait_sum += delta;
-       se->statistics.wait_start = 0;
+       schedstat_set(se->statistics.wait_max,
+                     max(schedstat_val(se->statistics.wait_max), delta));
+       schedstat_inc(se->statistics.wait_count);
+       schedstat_add(se->statistics.wait_sum, delta);
+       schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct task_struct *tsk = NULL;
+       u64 sleep_start, block_start;
+
+       if (!schedstat_enabled())
+               return;
+
+       sleep_start = schedstat_val(se->statistics.sleep_start);
+       block_start = schedstat_val(se->statistics.block_start);
+
+       if (entity_is_task(se))
+               tsk = task_of(se);
+
+       if (sleep_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+                       schedstat_set(se->statistics.sleep_max, delta);
+
+               schedstat_set(se->statistics.sleep_start, 0);
+               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+               if (tsk) {
+                       account_scheduler_latency(tsk, delta >> 10, 1);
+                       trace_sched_stat_sleep(tsk, delta);
+               }
+       }
+       if (block_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+                       schedstat_set(se->statistics.block_max, delta);
+
+               schedstat_set(se->statistics.block_start, 0);
+               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+               if (tsk) {
+                       if (tsk->in_iowait) {
+                               schedstat_add(se->statistics.iowait_sum, delta);
+                               schedstat_inc(se->statistics.iowait_count);
+                               trace_sched_stat_iowait(tsk, delta);
+                       }
+
+                       trace_sched_stat_blocked(tsk, delta);
+
+                       /*
+                        * Blocking time is in units of nanosecs, so shift by
+                        * 20 to get a milliseconds-range estimation of the
+                        * amount of time that the task spent sleeping:
+                        */
+                       if (unlikely(prof_on == SLEEP_PROFILING)) {
+                               profile_hits(SLEEP_PROFILING,
+                                               (void *)get_wchan(tsk),
+                                               delta >> 20);
+                       }
+                       account_scheduler_latency(tsk, delta >> 10, 0);
+               }
+       }
 }
 
 /*
  * Task is being enqueued - update stats:
  */
 static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+       if (!schedstat_enabled())
+               return;
+
        /*
         * Are we enqueueing a waiting task? (for current tasks
         * a dequeue/enqueue event is a NOP)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_start(cfs_rq, se);
+
+       if (flags & ENQUEUE_WAKEUP)
+               update_stats_enqueue_sleeper(cfs_rq, se);
 }
 
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+
+       if (!schedstat_enabled())
+               return;
+
        /*
         * Mark the end of the wait period if dequeueing a
         * waiting task:
@@ -885,40 +976,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
 
-       if (flags & DEQUEUE_SLEEP) {
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
+       if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+               struct task_struct *tsk = task_of(se);
 
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
+               if (tsk->state & TASK_INTERRUPTIBLE)
+                       schedstat_set(se->statistics.sleep_start,
+                                     rq_clock(rq_of(cfs_rq)));
+               if (tsk->state & TASK_UNINTERRUPTIBLE)
+                       schedstat_set(se->statistics.block_start,
+                                     rq_clock(rq_of(cfs_rq)));
        }
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
 }
 
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-}
-#endif
-
 /*
  * We are picking a new current task - update its stats:
  */
@@ -1514,7 +1583,8 @@ balance:
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+                                                  env->dst_cpu);
 
 assign:
        task_numa_assign(env, cur, imp);
@@ -2803,9 +2873,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
@@ -2925,10 +3007,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  *
  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
  *
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
  */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3174,68 +3256,6 @@ static inline int idle_balance(struct rq *rq)
 
 #endif /* CONFIG_SMP */
 
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
-       struct task_struct *tsk = NULL;
-
-       if (entity_is_task(se))
-               tsk = task_of(se);
-
-       if (se->statistics.sleep_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.sleep_max))
-                       se->statistics.sleep_max = delta;
-
-               se->statistics.sleep_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       account_scheduler_latency(tsk, delta >> 10, 1);
-                       trace_sched_stat_sleep(tsk, delta);
-               }
-       }
-       if (se->statistics.block_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.block_max))
-                       se->statistics.block_max = delta;
-
-               se->statistics.block_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       if (tsk->in_iowait) {
-                               se->statistics.iowait_sum += delta;
-                               se->statistics.iowait_count++;
-                               trace_sched_stat_iowait(tsk, delta);
-                       }
-
-                       trace_sched_stat_blocked(tsk, delta);
-
-                       /*
-                        * Blocking time is in units of nanosecs, so shift by
-                        * 20 to get a milliseconds-range estimation of the
-                        * amount of time that the task spent sleeping:
-                        */
-                       if (unlikely(prof_on == SLEEP_PROFILING)) {
-                               profile_hits(SLEEP_PROFILING,
-                                               (void *)get_wchan(tsk),
-                                               delta >> 20);
-                       }
-                       account_scheduler_latency(tsk, delta >> 10, 0);
-               }
-       }
-#endif
-}
-
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -3245,7 +3265,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
                d = -d;
 
        if (d > 3*sysctl_sched_latency)
-               schedstat_inc(cfs_rq, nr_spread_over);
+               schedstat_inc(cfs_rq->nr_spread_over);
 #endif
 }
 
@@ -3362,17 +3382,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
 
-       if (flags & ENQUEUE_WAKEUP) {
+       if (flags & ENQUEUE_WAKEUP)
                place_entity(cfs_rq, se, 0);
-               if (schedstat_enabled())
-                       enqueue_sleeper(cfs_rq, se);
-       }
 
        check_schedstat_required();
-       if (schedstat_enabled()) {
-               update_stats_enqueue(cfs_rq, se);
-               check_spread(cfs_rq, se);
-       }
+       update_stats_enqueue(cfs_rq, se, flags);
+       check_spread(cfs_rq, se);
        if (!curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
@@ -3439,8 +3454,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        update_curr(cfs_rq);
        dequeue_entity_load_avg(cfs_rq, se);
 
-       if (schedstat_enabled())
-               update_stats_dequeue(cfs_rq, se, flags);
+       update_stats_dequeue(cfs_rq, se, flags);
 
        clear_buddies(cfs_rq, se);
 
@@ -3514,25 +3528,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 * a CPU. So account for the time it spent waiting on the
                 * runqueue.
                 */
-               if (schedstat_enabled())
-                       update_stats_wait_end(cfs_rq, se);
+               update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
                update_load_avg(se, 1);
        }
 
        update_stats_curr_start(cfs_rq, se);
        cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
        /*
         * Track our maximum slice length, if the CPU's load is at
         * least twice that of our own weight (i.e. dont track it
         * when there are only lesser-weight tasks around):
         */
        if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-               se->statistics.slice_max = max(se->statistics.slice_max,
-                       se->sum_exec_runtime - se->prev_sum_exec_runtime);
+               schedstat_set(se->statistics.slice_max,
+                       max((u64)schedstat_val(se->statistics.slice_max),
+                           se->sum_exec_runtime - se->prev_sum_exec_runtime));
        }
-#endif
+
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
@@ -3611,13 +3625,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
        /* throttle cfs_rqs exceeding runtime */
        check_cfs_rq_runtime(cfs_rq);
 
-       if (schedstat_enabled()) {
-               check_spread(cfs_rq, prev);
-               if (prev->on_rq)
-                       update_stats_wait_start(cfs_rq, prev);
-       }
+       check_spread(cfs_rq, prev);
 
        if (prev->on_rq) {
+               update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
                /* in !on_rq case, update occurred at dequeue */
@@ -5082,18 +5093,18 @@ static int wake_wide(struct task_struct *p)
        return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
 {
        s64 this_load, load;
        s64 this_eff_load, prev_eff_load;
-       int idx, this_cpu, prev_cpu;
+       int idx, this_cpu;
        struct task_group *tg;
        unsigned long weight;
        int balanced;
 
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
        load      = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
 
@@ -5137,13 +5148,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 
        balanced = this_eff_load <= prev_eff_load;
 
-       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 
        if (!balanced)
                return 0;
 
-       schedstat_inc(sd, ttwu_move_affine);
-       schedstat_inc(p, se.statistics.nr_wakeups_affine);
+       schedstat_inc(sd->ttwu_move_affine);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine);
 
        return 1;
 }
@@ -5219,6 +5230,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        int shallowest_idle_cpu = -1;
        int i;
 
+       /* Check if we have any choice: */
+       if (group->group_weight == 1)
+               return cpumask_first(sched_group_cpus(group));
+
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                if (idle_cpu(i)) {
@@ -5258,11 +5273,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
-       int i = task_cpu(p);
 
        if (idle_cpu(target))
                return target;
@@ -5270,8 +5284,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        /*
         * If the previous cpu is cache affine and idle, don't be stupid.
         */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+       if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+               return prev;
 
        /*
         * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -5292,6 +5306,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        for_each_lower_domain(sd) {
                sg = sd->groups;
                do {
+                       int i;
+
                        if (!cpumask_intersects(sched_group_cpus(sg),
                                                tsk_cpus_allowed(p)))
                                goto next;
@@ -5351,6 +5367,32 @@ static int cpu_util(int cpu)
        return (util >= capacity) ? capacity : util;
 }
 
+static inline int task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long min_cap, max_cap;
+
+       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       /* Minimum capacity is close to max, no need to abort wake_affine */
+       if (max_cap - min_cap < max_cap >> 3)
+               return 0;
+
+       return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5374,7 +5416,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+                             && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
        }
 
        rcu_read_lock();
@@ -5400,13 +5443,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (affine_sd) {
                sd = NULL; /* Prefer wake_affine over balance flags */
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                        new_cpu = cpu;
        }
 
        if (!sd) {
                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-                       new_cpu = select_idle_sibling(p, new_cpu);
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
        } else while (sd) {
                struct sched_group *group;
@@ -5930,7 +5973,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *
  * The adjacency matrix of the resulting graph is given by:
  *
- *             log_2 n     
+ *             log_2 n
  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
  *             k = 0
  *
@@ -5976,7 +6019,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *
  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
  *      rewrite all of this once again.]
- */ 
+ */
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
@@ -6124,7 +6167,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
                int cpu;
 
-               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+               schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
 
                env->flags |= LBF_SOME_PINNED;
 
@@ -6155,7 +6198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
        env->flags &= ~LBF_ALL_PINNED;
 
        if (task_running(env->src_rq, p)) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+               schedstat_inc(p->se.statistics.nr_failed_migrations_running);
                return 0;
        }
 
@@ -6172,13 +6215,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
        if (tsk_cache_hot <= 0 ||
            env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                if (tsk_cache_hot == 1) {
-                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
-                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+                       schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+                       schedstat_inc(p->se.statistics.nr_forced_migrations);
                }
                return 1;
        }
 
-       schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+       schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
        return 0;
 }
 
@@ -6218,7 +6261,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
                 * so we can safely collect stats here rather than
                 * inside detach_tasks().
                 */
-               schedstat_inc(env->sd, lb_gained[env->idle]);
+               schedstat_inc(env->sd->lb_gained[env->idle]);
                return p;
        }
        return NULL;
@@ -6310,7 +6353,7 @@ next:
         * so we can safely collect detach_one_task() stats here rather
         * than inside detach_one_task().
         */
-       schedstat_add(env->sd, lb_gained[env->idle], detached);
+       schedstat_add(env->sd->lb_gained[env->idle], detached);
 
        return detached;
 }
@@ -6638,7 +6681,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                /*
                 * !SD_OVERLAP domains can assume that child groups
                 * span the current group.
-                */ 
+                */
 
                group = child->groups;
                do {
@@ -7138,7 +7181,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
                if (load_above_capacity > busiest->group_capacity) {
                        load_above_capacity -= busiest->group_capacity;
-                       load_above_capacity *= NICE_0_LOAD;
+                       load_above_capacity *= scale_load_down(NICE_0_LOAD);
                        load_above_capacity /= busiest->group_capacity;
                } else
                        load_above_capacity = ~0UL;
@@ -7451,7 +7494,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
        cpumask_copy(cpus, cpu_active_mask);
 
-       schedstat_inc(sd, lb_count[idle]);
+       schedstat_inc(sd->lb_count[idle]);
 
 redo:
        if (!should_we_balance(&env)) {
@@ -7461,19 +7504,19 @@ redo:
 
        group = find_busiest_group(&env);
        if (!group) {
-               schedstat_inc(sd, lb_nobusyg[idle]);
+               schedstat_inc(sd->lb_nobusyg[idle]);
                goto out_balanced;
        }
 
        busiest = find_busiest_queue(&env, group);
        if (!busiest) {
-               schedstat_inc(sd, lb_nobusyq[idle]);
+               schedstat_inc(sd->lb_nobusyq[idle]);
                goto out_balanced;
        }
 
        BUG_ON(busiest == env.dst_rq);
 
-       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+       schedstat_add(sd->lb_imbalance[idle], env.imbalance);
 
        env.src_cpu = busiest->cpu;
        env.src_rq = busiest;
@@ -7580,7 +7623,7 @@ more_balance:
        }
 
        if (!ld_moved) {
-               schedstat_inc(sd, lb_failed[idle]);
+               schedstat_inc(sd->lb_failed[idle]);
                /*
                 * Increment the failure counter only on periodic balance.
                 * We do not want newidle balance, which can be very
@@ -7663,7 +7706,7 @@ out_all_pinned:
         * we can't migrate them. Let the imbalance flag set so parent level
         * can try to migrate them.
         */
-       schedstat_inc(sd, lb_balanced[idle]);
+       schedstat_inc(sd->lb_balanced[idle]);
 
        sd->nr_balance_failed = 0;
 
@@ -7695,11 +7738,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 }
 
 static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
 {
        unsigned long interval, next;
 
-       interval = get_sd_balance_interval(sd, cpu_busy);
+       /* used by idle balance, so cpu_busy = 0 */
+       interval = get_sd_balance_interval(sd, 0);
        next = sd->last_balance + interval;
 
        if (time_after(*next_balance, next))
@@ -7729,7 +7773,7 @@ static int idle_balance(struct rq *this_rq)
                rcu_read_lock();
                sd = rcu_dereference_check_sched_domain(this_rq->sd);
                if (sd)
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                rcu_read_unlock();
 
                goto out;
@@ -7747,7 +7791,7 @@ static int idle_balance(struct rq *this_rq)
                        continue;
 
                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                        break;
                }
 
@@ -7765,7 +7809,7 @@ static int idle_balance(struct rq *this_rq)
                        curr_cost += domain_cost;
                }
 
-               update_next_balance(sd, 0, &next_balance);
+               update_next_balance(sd, &next_balance);
 
                /*
                 * Stop searching for tasks to pull if there are
@@ -7855,15 +7899,15 @@ static int active_load_balance_cpu_stop(void *data)
                        .idle           = CPU_IDLE,
                };
 
-               schedstat_inc(sd, alb_count);
+               schedstat_inc(sd->alb_count);
 
                p = detach_one_task(&env);
                if (p) {
-                       schedstat_inc(sd, alb_pushed);
+                       schedstat_inc(sd->alb_pushed);
                        /* Active balancing done, reset the failure counter. */
                        sd->nr_balance_failed = 0;
                } else {
-                       schedstat_inc(sd, alb_failed);
+                       schedstat_inc(sd->alb_failed);
                }
        }
        rcu_read_unlock();
@@ -8432,7 +8476,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
 
        if (!vruntime_normalized(p)) {
                /*
@@ -8444,10 +8487,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
        }
 
        /* Catch up with the cfs_rq and remove our load when we leave */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
        detach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -8455,7 +8497,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /*
@@ -8466,10 +8507,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
 #endif
 
        /* Synchronize task with its cfs_rq */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
        attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
 
        if (!vruntime_normalized(p))
                se->vruntime += cfs_rq->min_vruntime;
index 2ce5458bbe1d1ad82c16f7cbe38d79c23f91ebb0..dedc81ecbb2e2ad221f937584bc39070d1e3fe3d 100644 (file)
@@ -28,7 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie c
 {
        put_prev_task(rq, prev);
 
-       schedstat_inc(rq, sched_goidle);
+       schedstat_inc(rq->sched_goidle);
        return rq->idle;
 }
 
index b7fc1ced438018c8a63c58d3a120a7a8a6348827..8ac71b73a8e880bfff0891633f1d8ed0518d7965 100644 (file)
@@ -565,6 +565,8 @@ struct root_domain {
         */
        cpumask_var_t rto_mask;
        struct cpupri cpupri;
+
+       unsigned long max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;
@@ -597,7 +599,6 @@ struct rq {
 #ifdef CONFIG_SMP
        unsigned long last_load_update_tick;
 #endif /* CONFIG_SMP */
-       u64 nohz_stamp;
        unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
index 78955cbea31c4a5378bc0b73f5f3bbcbdfa23d75..34659a853505bec33603c423cd0ff71b989410a6 100644 (file)
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
        if (rq)
                rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
-# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
-# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
-# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
-# define schedstat_val(rq, field)      ((schedstat_enabled()) ? (rq)->field : 0)
+#define schedstat_enabled()            static_branch_unlikely(&sched_schedstats)
+#define schedstat_inc(var)             do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_add(var, amt)                do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define schedstat_set(var, val)                do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var)             (var)
+#define schedstat_val_or_zero(var)     ((schedstat_enabled()) ? (var) : 0)
 
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
-# define schedstat_enabled()           0
-# define schedstat_inc(rq, field)      do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val)       do { } while (0)
-# define schedstat_val(rq, field)      0
-#endif
+#define schedstat_enabled()            0
+#define schedstat_inc(var)             do { } while (0)
+#define schedstat_add(var, amt)                do { } while (0)
+#define schedstat_set(var, val)                do { } while (0)
+#define schedstat_val(var)             0
+#define schedstat_val_or_zero(var)     0
+#endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_SCHED_INFO
 static inline void sched_info_reset_dequeued(struct task_struct *t)
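The stats.h hunk above changes the schedstat helpers from taking a (rq, field) pair to taking a complete lvalue expression, which is what the idle_task.c hunk earlier in this diff reflects. A minimal before/after sketch of a call site (the sched_goidle field is taken from that hunk; this is an illustrative fragment, not a standalone translation unit):

/* Old API: the macro dereferenced rq and appended the field name itself. */
schedstat_inc(rq, sched_goidle);

/* New API: the caller passes the whole expression, so the helpers now work
 * for any lvalue, not only struct rq members. */
schedstat_inc(rq->sched_goidle);
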
index 3aa642d39c0370849372cab0c95f9a5d7760e794..f4f6137941cbefd27bf068bf1c9d93a9b5976499 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/hypervisor.h>
 
 #include "smpboot.h"
 
@@ -724,3 +725,53 @@ void wake_up_all_idle_cpus(void)
        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
+
+/**
+ * smp_call_on_cpu - Call a function on a specific cpu
+ *
+ * Used to call a function on a specific cpu and wait for it to return.
+ * Optionally make sure the call is done on a specified physical cpu via vcpu
+ * pinning in order to support virtualized environments.
+ */
+struct smp_call_on_cpu_struct {
+       struct work_struct      work;
+       struct completion       done;
+       int                     (*func)(void *);
+       void                    *data;
+       int                     ret;
+       int                     cpu;
+};
+
+static void smp_call_on_cpu_callback(struct work_struct *work)
+{
+       struct smp_call_on_cpu_struct *sscs;
+
+       sscs = container_of(work, struct smp_call_on_cpu_struct, work);
+       if (sscs->cpu >= 0)
+               hypervisor_pin_vcpu(sscs->cpu);
+       sscs->ret = sscs->func(sscs->data);
+       if (sscs->cpu >= 0)
+               hypervisor_pin_vcpu(-1);
+
+       complete(&sscs->done);
+}
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+       struct smp_call_on_cpu_struct sscs = {
+               .work = __WORK_INITIALIZER(sscs.work, smp_call_on_cpu_callback),
+               .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
+               .func = func,
+               .data = par,
+               .cpu  = phys ? cpu : -1,
+       };
+
+       if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+               return -ENXIO;
+
+       queue_work_on(cpu, system_wq, &sscs.work);
+       wait_for_completion(&sscs.done);
+
+       return sscs.ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
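The kernel/smp.c hunk above adds smp_call_on_cpu(), which runs a function on a chosen CPU from a workqueue item and waits for the result, optionally pinning the vCPU to the matching physical CPU first. A minimal sketch of a caller follows; the callback and its payload are made up for illustration, and this is a kernel-side fragment rather than a complete module:

#include <linux/smp.h>

/* Hypothetical callback: runs on the requested CPU and returns an int. */
static int read_local_state(void *data)
{
        int *out = data;

        *out = smp_processor_id();      /* any per-cpu work would go here */
        return 0;
}

static int query_cpu(unsigned int cpu)
{
        int value = 0;
        int ret;

        /* phys=true additionally pins the vCPU on virtualized hosts. */
        ret = smp_call_on_cpu(cpu, read_local_state, &value, true);
        return ret ? ret : value;
}
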
index 34033fd09c8c66ffe1249db174a3db780e28df94..ad039266a7b382c4804e238f4f9897c3a04bf7a0 100644 (file)
@@ -700,7 +700,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
        BUG();
 }
 
-static void takeover_tasklets(unsigned int cpu)
+static int takeover_tasklets(unsigned int cpu)
 {
        /* CPU is dead, so no lock needed. */
        local_irq_disable();
@@ -723,27 +723,12 @@ static void takeover_tasklets(unsigned int cpu)
        raise_softirq_irqoff(HI_SOFTIRQ);
 
        local_irq_enable();
+       return 0;
 }
+#else
+#define takeover_tasklets      NULL
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
-                       void *hcpu)
-{
-       switch (action) {
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               takeover_tasklets((unsigned long)hcpu);
-               break;
-#endif /* CONFIG_HOTPLUG_CPU */
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
-       .notifier_call = cpu_callback
-};
-
 static struct smp_hotplug_thread softirq_threads = {
        .store                  = &ksoftirqd,
        .thread_should_run      = ksoftirqd_should_run,
@@ -753,8 +738,8 @@ static struct smp_hotplug_thread softirq_threads = {
 
 static __init int spawn_ksoftirqd(void)
 {
-       register_cpu_notifier(&cpu_nfb);
-
+       cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
+                                 takeover_tasklets);
        BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
 
        return 0;
index 6a5a310a1a5351205ef9bd633a39efa427897556..7e4fad75acaaddbac4d11ade83f6e14cc22aee8c 100644 (file)
@@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur)
                 */
                if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
                        /* Override clocksource cannot be used. */
-                       pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
-                               cs->name);
-                       override_name[0] = 0;
+                       if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+                               pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+                                       cs->name);
+                               override_name[0] = 0;
+                       } else {
+                               /*
+                                * The override cannot be currently verified.
+                                * Deferring to let the watchdog check.
+                                */
+                               pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
+                                       cs->name);
+                       }
                } else
                        /* Override clocksource can be used. */
                        best = cs;
index 9ba7c820fc231b2bfef4dd94228dbb6cf8c6f877..bb5ec425dfe0bafdadc51b3fe409c20bdbf30926 100644 (file)
@@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns);
  */
 ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
 {
-       ktime_t res = ktime_add(lhs, rhs);
+       ktime_t res = ktime_add_unsafe(lhs, rhs);
 
        /*
         * We use KTIME_SEC_MAX here, the maximum timeout which we can
@@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work)
 static DECLARE_WORK(hrtimer_work, clock_was_set_work);
 
 /*
- * Called from timekeeping and resume code to reprogramm the hrtimer
+ * Called from timekeeping and resume code to reprogram the hrtimer
  * interrupt device on all cpus.
  */
 void clock_was_set_delayed(void)
@@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 
        /*
         * Note: We clear the running state after enqueue_hrtimer and
-        * we do not reprogramm the event hardware. Happens either in
+        * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
index 667b9335f5d6bf4b86338eadf840dc22e8fc94d5..bd62fb8e8e77f691959859b8ffa1c2af4ab670dc 100644 (file)
@@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 {
        struct timespec64 res;
 
-       set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+       set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
                        lhs.tv_nsec + rhs.tv_nsec);
 
        if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
index 107310a6f36f43a47300c46a4bd3b51a6b5b70fa..ca9fb800336b2904e4c19069ad8c9e29ea926830 100644 (file)
@@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
        int bin = min(fls(t->tv_sec), NUM_BINS-1);
 
        sleep_time_bin[bin]++;
+       pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec,
+                       t->tv_nsec / NSEC_PER_MSEC);
 }
 
index f4b86e8ca1e77dba8d969aa009491876186f457f..ba3326785ca4cffcf245da527f0386b2d8c23608 100644 (file)
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
        help
          See Documentation/trace/ftrace-design.txt
 
-config HAVE_FUNCTION_GRAPH_FP_TEST
-       bool
-       help
-         See Documentation/trace/ftrace-design.txt
-
 config HAVE_DYNAMIC_FTRACE
        bool
        help
index dade4c9559cc036c1b6aa8567abf0b0887847923..1e2ce3b52e51fdd843995350a3dffc11c98b584e 100644 (file)
@@ -4123,6 +4123,30 @@ static const char readme_msg[] =
        "\t\t\t  traces\n"
 #endif
 #endif /* CONFIG_STACK_TRACER */
+#ifdef CONFIG_KPROBE_EVENT
+       "  kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
+       "\t\t\t  Write into this file to define/undefine new trace events.\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+       "  uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
+       "\t\t\t  Write into this file to define/undefine new trace events.\n"
+#endif
+#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT)
+       "\t  accepts: event-definitions (one definition per line)\n"
+       "\t   Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+       "\t           -:[<group>/]<event>\n"
+#ifdef CONFIG_KPROBE_EVENT
+       "\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+#endif
+#ifdef CONFIG_UPROBE_EVENT
+       "\t    place: <path>:<offset>\n"
+#endif
+       "\t     args: <name>=fetcharg[:type]\n"
+       "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+       "\t           $stack<index>, $stack, $retval, $comm\n"
+       "\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n"
+       "\t           b<bit-width>@<bit-offset>/<container-size>\n"
+#endif
        "  events/\t\t- Directory containing all trace event subsystems:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
        "  events/<system>/\t- Directory containing all trace events for <system>:\n"
index 7363ccf795125ce14d89a3303300e47a096c877e..0cbe38a844fa9b57563cb650f93973b7d35e71f0 100644 (file)
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 /* Add a function return address to the trace stack on thread info.*/
 int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
-                        unsigned long frame_pointer)
+                        unsigned long frame_pointer, unsigned long *retp)
 {
        unsigned long long calltime;
        int index;
@@ -171,7 +171,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
        current->ret_stack[index].func = func;
        current->ret_stack[index].calltime = calltime;
        current->ret_stack[index].subtime = 0;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+       current->ret_stack[index].retp = retp;
+#endif
        *depth = current->curr_ret_stack;
 
        return 0;
@@ -204,7 +209,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
                return;
        }
 
-#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        /*
         * The arch may choose to record the frame pointer used
         * and check it here to make sure that it is what we expect it
@@ -279,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        return ret;
 }
 
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ *                        to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'.  If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack.  It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp)
+{
+       int index = task->curr_ret_stack;
+       int i;
+
+       if (ret != (unsigned long)return_to_handler)
+               return ret;
+
+       if (index < -1)
+               index += FTRACE_NOTRACE_DEPTH;
+
+       if (index < 0)
+               return ret;
+
+       for (i = 0; i <= index; i++)
+               if (task->ret_stack[i].retp == retp)
+                       return task->ret_stack[i].ret;
+
+       return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp)
+{
+       int task_idx;
+
+       if (ret != (unsigned long)return_to_handler)
+               return ret;
+
+       task_idx = task->curr_ret_stack;
+
+       if (!task->ret_stack || task_idx < *idx)
+               return ret;
+
+       task_idx -= *idx;
+       (*idx)++;
+
+       return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
 int __trace_graph_entry(struct trace_array *tr,
                                struct ftrace_graph_ent *trace,
                                unsigned long flags,
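The ftrace_graph_ret_addr() helper added above is aimed at arch stack unwinders: they keep an index state that starts at zero and pass each return address together with its location on the stack, getting the original address back when the graph tracer has replaced it with return_to_handler. A minimal sketch of that calling convention (the frame-collection part is invented for illustration; only the ftrace_graph_ret_addr() call reflects this patch):

static void report_return_addresses(struct task_struct *task,
                                    unsigned long **retps, int nr)
{
        int graph_idx = 0;      /* state variable, must start at zero */
        int i;

        for (i = 0; i < nr; i++) {
                unsigned long *retp = retps[i];
                unsigned long ret = *retp;

                /* Undo the graph tracer's rewrite, if any; an unmodified
                 * address is returned unchanged. */
                ret = ftrace_graph_ret_addr(task, &graph_idx, ret, retp);
                pr_info("%pS\n", (void *)ret);
        }
}
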
index 9aedb0b06683765f4293eb750465ed5849c2902f..eb6c9f1d3a932f93a9a88b3d6b9c742457f84d11 100644 (file)
@@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = {
        ASSIGN_FETCH_TYPE(s16, u16, 1),
        ASSIGN_FETCH_TYPE(s32, u32, 1),
        ASSIGN_FETCH_TYPE(s64, u64, 1),
+       ASSIGN_FETCH_TYPE_ALIAS(x8,  u8,  u8,  0),
+       ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+       ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+       ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
 
        ASSIGN_FETCH_TYPE_END
 };
index 74e80a582c28988fbc0227453378d36bf4346999..8c0553d9afd3f2563756641cfa0e659a4ab7ff99 100644 (file)
@@ -36,24 +36,28 @@ const char *reserved_field_names[] = {
 };
 
 /* Printing  in basic type function template */
-#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt)                                \
-int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name,  \
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt)                 \
+int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \
                                void *data, void *ent)                  \
 {                                                                      \
        trace_seq_printf(s, " %s=" fmt, name, *(type *)data);           \
        return !trace_seq_has_overflowed(s);                            \
 }                                                                      \
-const char PRINT_TYPE_FMT_NAME(type)[] = fmt;                          \
-NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
-
-DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
-DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s8,  "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
-DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
+const char PRINT_TYPE_FMT_NAME(tname)[] = fmt;                         \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname));
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8,  u8,  "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8,  s8,  "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x8,  u8,  "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
 
 /* Print type function for string type */
 int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
index 45400ca5ded1acf673d713851f930425b1d20849..0c0ae54d44c616d5d09876165c2516c3e032af77 100644 (file)
@@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
 DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
 DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
 DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+
 DECLARE_BASIC_PRINT_TYPE_FUNC(string);
 
 #define FETCH_FUNC_NAME(method, type)  fetch_##method##_##type
@@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32)          \
 DEFINE_FETCH_##method(u64)
 
 /* Default (unsigned long) fetch type */
-#define __DEFAULT_FETCH_TYPE(t) u##t
+#define __DEFAULT_FETCH_TYPE(t) x##t
 #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
 #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
 #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
@@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype),                     \
 #define ASSIGN_FETCH_TYPE(ptype, ftype, sign)                  \
        __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
 
+/* If ptype is an alias of atype, use this macro (show atype in format) */
+#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign)             \
+       __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype)
+
 #define ASSIGN_FETCH_TYPE_END {}
 
 #define FETCH_TYPE_STRING      0
index c53485441c88ad142cbab7bb7edfd4e5139832d1..7a687320f8671c7e82ee1c5ebca6dc7d2b77380c 100644 (file)
@@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = {
        ASSIGN_FETCH_TYPE(s16, u16, 1),
        ASSIGN_FETCH_TYPE(s32, u32, 1),
        ASSIGN_FETCH_TYPE(s64, u64, 1),
+       ASSIGN_FETCH_TYPE_ALIAS(x8,  u8,  u8,  0),
+       ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
+       ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
+       ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
 
        ASSIGN_FETCH_TYPE_END
 };
index 1760bf3d14636266eae1f81aa84f277076c4335d..ee81ac9af4caff801a531a9219efd19637b76688 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/smp.h>
+#include <linux/hypervisor.h>
 
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                                int wait)
@@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
        preempt_enable();
 }
 EXPORT_SYMBOL(on_each_cpu_cond);
+
+int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
+{
+       int ret;
+
+       if (cpu != 0)
+               return -ENXIO;
+
+       if (phys)
+               hypervisor_pin_vcpu(0);
+       ret = func(par);
+       if (phys)
+               hypervisor_pin_vcpu(-1);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_on_cpu);
index fcfa1939ac41abe768b401f235b8afd7c6d75dbe..06f02f6aecd2b7ee974497941d0c6b7302f83f1c 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/stacktrace.h>
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
+#include <linux/vmalloc.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/export.h>
@@ -1164,11 +1165,32 @@ static void check_unmap(struct dma_debug_entry *ref)
        put_hash_bucket(bucket, &flags);
 }
 
-static void check_for_stack(struct device *dev, void *addr)
+static void check_for_stack(struct device *dev,
+                           struct page *page, size_t offset)
 {
-       if (object_is_on_stack(addr))
-               err_printk(dev, NULL, "DMA-API: device driver maps memory from "
-                               "stack [addr=%p]\n", addr);
+       void *addr;
+       struct vm_struct *stack_vm_area = task_stack_vm_area(current);
+
+       if (!stack_vm_area) {
+               /* Stack is direct-mapped. */
+               if (PageHighMem(page))
+                       return;
+               addr = page_address(page) + offset;
+               if (object_is_on_stack(addr))
+                       err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+       } else {
+               /* Stack is vmalloced. */
+               int i;
+
+               for (i = 0; i < stack_vm_area->nr_pages; i++) {
+                       if (page != stack_vm_area->pages[i])
+                               continue;
+
+                       addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+                       err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+                       break;
+               }
+       }
 }
 
 static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
@@ -1291,10 +1313,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
        if (map_single)
                entry->type = dma_debug_single;
 
+       check_for_stack(dev, page, offset);
+
        if (!PageHighMem(page)) {
                void *addr = page_address(page) + offset;
 
-               check_for_stack(dev, addr);
                check_for_illegal_area(dev, addr, size);
        }
 
@@ -1386,8 +1409,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
                entry->sg_call_ents   = nents;
                entry->sg_mapped_ents = mapped_ents;
 
+               check_for_stack(dev, sg_page(s), s->offset);
+
                if (!PageHighMem(sg_page(s))) {
-                       check_for_stack(dev, sg_virt(s));
                        check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
                }
 
index a4830f0325fe255ef4e106bb9b982bf7a2747415..7b35ee3894ee4a8970a502826e860a644222e4a9 100644 (file)
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
+#include <linux/pkeys.h>
 #include <linux/ksm.h>
 #include <linux/pkeys.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
+#include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
 #include "internal.h"
@@ -352,8 +354,11 @@ fail:
        return error;
 }
 
-SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-               unsigned long, prot)
+/*
+ * pkey==-1 when doing a legacy mprotect()
+ */
+static int do_mprotect_pkey(unsigned long start, size_t len,
+               unsigned long prot, int pkey)
 {
        unsigned long nstart, end, tmp, reqprot;
        struct vm_area_struct *vma, *prev;
@@ -382,6 +387,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
        if (down_write_killable(&current->mm->mmap_sem))
                return -EINTR;
 
+       /*
+        * If userspace did not allocate the pkey, do not let
+        * them use it here.
+        */
+       error = -EINVAL;
+       if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
+               goto out;
+
        vma = find_vma(current->mm, start);
        error = -ENOMEM;
        if (!vma)
@@ -408,8 +421,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                prev = vma;
 
        for (nstart = start ; ; ) {
+               unsigned long mask_off_old_flags;
                unsigned long newflags;
-               int pkey = arch_override_mprotect_pkey(vma, prot, -1);
+               int new_vma_pkey;
 
                /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
 
@@ -417,8 +431,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                if (rier && (vma->vm_flags & VM_MAYEXEC))
                        prot |= PROT_EXEC;
 
-               newflags = calc_vm_prot_bits(prot, pkey);
-               newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+               /*
+                * Each mprotect() call explicitly passes r/w/x permissions.
+                * If a permission is not passed to mprotect(), it must be
+                * cleared from the VMA.
+                */
+               mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
+                                       ARCH_VM_PKEY_FLAGS;
+
+               new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
+               newflags = calc_vm_prot_bits(prot, new_vma_pkey);
+               newflags |= (vma->vm_flags & ~mask_off_old_flags);
 
                /* newflags >> 4 shift VM_MAY% in place of VM_% */
                if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
@@ -454,3 +477,60 @@ out:
        up_write(&current->mm->mmap_sem);
        return error;
 }
+
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+               unsigned long, prot)
+{
+       return do_mprotect_pkey(start, len, prot, -1);
+}
+
+SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
+               unsigned long, prot, int, pkey)
+{
+       return do_mprotect_pkey(start, len, prot, pkey);
+}
+
+SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
+{
+       int pkey;
+       int ret;
+
+       /* No flags supported yet. */
+       if (flags)
+               return -EINVAL;
+       /* check for unsupported init values */
+       if (init_val & ~PKEY_ACCESS_MASK)
+               return -EINVAL;
+
+       down_write(&current->mm->mmap_sem);
+       pkey = mm_pkey_alloc(current->mm);
+
+       ret = -ENOSPC;
+       if (pkey == -1)
+               goto out;
+
+       ret = arch_set_user_pkey_access(current, pkey, init_val);
+       if (ret) {
+               mm_pkey_free(current->mm, pkey);
+               goto out;
+       }
+       ret = pkey;
+out:
+       up_write(&current->mm->mmap_sem);
+       return ret;
+}
+
+SYSCALL_DEFINE1(pkey_free, int, pkey)
+{
+       int ret;
+
+       down_write(&current->mm->mmap_sem);
+       ret = mm_pkey_free(current->mm, pkey);
+       up_write(&current->mm->mmap_sem);
+
+       /*
+        * We could provide warnings or errors if any VMA still
+        * has the pkey set here.
+        */
+       return ret;
+}
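The mprotect.c hunk above wires up the pkey_mprotect, pkey_alloc and pkey_free system calls. From userspace they are meant to be used roughly as in the sketch below; this is illustrative only, calling syscall() directly since no libc wrappers existed at the time, and the SYS_pkey_* numbers must come from kernel headers that already contain this series:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        long pkey;
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        /* Allocate a key; init_val 0 leaves all access rights enabled. */
        pkey = syscall(SYS_pkey_alloc, 0, 0);
        if (pkey < 0) {
                perror("pkey_alloc");
                return 1;
        }

        /* Tag the mapping with the key; r/w/x still come from prot. */
        if (syscall(SYS_pkey_mprotect, p, 4096,
                    PROT_READ | PROT_WRITE, pkey))
                perror("pkey_mprotect");

        /* Give the key back. */
        syscall(SYS_pkey_free, pkey);
        return 0;
}
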
index f4cd7d8005c9071dc1caf77a155eb3f2dc611b61..28d6f36a2d79c26c30e25d52d01a8dc95dbaa4ac 100644 (file)
@@ -2080,26 +2080,12 @@ void writeback_set_ratelimit(void)
                ratelimit_pages = 16;
 }
 
-static int
-ratelimit_handler(struct notifier_block *self, unsigned long action,
-                 void *hcpu)
+static int page_writeback_cpu_online(unsigned int cpu)
 {
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               writeback_set_ratelimit();
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
+       writeback_set_ratelimit();
+       return 0;
 }
 
-static struct notifier_block ratelimit_nb = {
-       .notifier_call  = ratelimit_handler,
-       .next           = NULL,
-};
-
 /*
  * Called early on to tune the page writeback dirty limits.
  *
@@ -2122,8 +2108,10 @@ void __init page_writeback_init(void)
 {
        BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
 
-       writeback_set_ratelimit();
-       register_cpu_notifier(&ratelimit_nb);
+       cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
+                         page_writeback_cpu_online, NULL);
+       cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
+                         page_writeback_cpu_online);
 }
 
 /**
index b67271024135aff957f1361c3e327da41915b4c2..090fb26b3a39b4feba105650f2a663f5c9972064 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -886,6 +886,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
        return 0;
 }
 
+#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
 /*
  * Allocates and initializes node for a node on each slab cache, used for
  * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
@@ -908,6 +909,7 @@ static int init_cache_node_node(int node)
 
        return 0;
 }
+#endif
 
 static int setup_kmem_cache_node(struct kmem_cache *cachep,
                                int node, gfp_t gfp, bool force_change)
@@ -975,6 +977,8 @@ fail:
        return ret;
 }
 
+#ifdef CONFIG_SMP
+
 static void cpuup_canceled(long cpu)
 {
        struct kmem_cache *cachep;
@@ -1075,65 +1079,54 @@ bad:
        return -ENOMEM;
 }
 
-static int cpuup_callback(struct notifier_block *nfb,
-                                   unsigned long action, void *hcpu)
+int slab_prepare_cpu(unsigned int cpu)
 {
-       long cpu = (long)hcpu;
-       int err = 0;
+       int err;
 
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               mutex_lock(&slab_mutex);
-               err = cpuup_prepare(cpu);
-               mutex_unlock(&slab_mutex);
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               start_cpu_timer(cpu);
-               break;
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               /*
-                * Shutdown cache reaper. Note that the slab_mutex is
-                * held so that if cache_reap() is invoked it cannot do
-                * anything expensive but will only modify reap_work
-                * and reschedule the timer.
-               */
-               cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
-               /* Now the cache_reaper is guaranteed to be not running. */
-               per_cpu(slab_reap_work, cpu).work.func = NULL;
-               break;
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               start_cpu_timer(cpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /*
-                * Even if all the cpus of a node are down, we don't free the
-                * kmem_cache_node of any cache. This to avoid a race between
-                * cpu_down, and a kmalloc allocation from another cpu for
-                * memory from the node of the cpu going down.  The node
-                * structure is usually allocated from kmem_cache_create() and
-                * gets destroyed at kmem_cache_destroy().
-                */
-               /* fall through */
+       mutex_lock(&slab_mutex);
+       err = cpuup_prepare(cpu);
+       mutex_unlock(&slab_mutex);
+       return err;
+}
+
+/*
+ * This is called for a failed online attempt and for a successful
+ * offline.
+ *
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_cache_node of any cache. This is to avoid a race between cpu_down
+ * and a kmalloc allocation from another cpu for memory from the node of
+ * the cpu going down.  The node structure is usually allocated from
+ * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
+ */
+int slab_dead_cpu(unsigned int cpu)
+{
+       mutex_lock(&slab_mutex);
+       cpuup_canceled(cpu);
+       mutex_unlock(&slab_mutex);
+       return 0;
+}
 #endif
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               mutex_lock(&slab_mutex);
-               cpuup_canceled(cpu);
-               mutex_unlock(&slab_mutex);
-               break;
-       }
-       return notifier_from_errno(err);
+
+static int slab_online_cpu(unsigned int cpu)
+{
+       start_cpu_timer(cpu);
+       return 0;
 }
 
-static struct notifier_block cpucache_notifier = {
-       &cpuup_callback, NULL, 0
-};
+static int slab_offline_cpu(unsigned int cpu)
+{
+       /*
+        * Shutdown cache reaper. Note that the slab_mutex is held so
+        * that if cache_reap() is invoked it cannot do anything
+        * expensive but will only modify reap_work and reschedule the
+        * timer.
+        */
+       cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
+       /* Now the cache_reaper is guaranteed to be not running. */
+       per_cpu(slab_reap_work, cpu).work.func = NULL;
+       return 0;
+}
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
 /*
@@ -1336,12 +1329,6 @@ void __init kmem_cache_init_late(void)
        /* Done! */
        slab_state = FULL;
 
-       /*
-        * Register a cpu startup notifier callback that initializes
-        * cpu_cache_get for all new cpus
-        */
-       register_cpu_notifier(&cpucache_notifier);
-
 #ifdef CONFIG_NUMA
        /*
         * Register a memory hotplug callback that initializes and frees
@@ -1358,13 +1345,14 @@ void __init kmem_cache_init_late(void)
 
 static int __init cpucache_init(void)
 {
-       int cpu;
+       int ret;
 
        /*
         * Register the timers that return unneeded pages to the page allocator
         */
-       for_each_online_cpu(cpu)
-               start_cpu_timer(cpu);
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
+                               slab_online_cpu, slab_offline_cpu);
+       WARN_ON(ret < 0);
 
        /* Done! */
        slab_state = FULL;
index 9adae58462f8191b22659b1aa438ec637f6fc765..2b3e740609e92e29a7b52ddb6df6b8f0b0897136 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -194,10 +194,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 #define __OBJECT_POISON                0x80000000UL /* Poison object */
 #define __CMPXCHG_DOUBLE       0x40000000UL /* Use cmpxchg_double */
 
-#ifdef CONFIG_SMP
-static struct notifier_block slab_notifier;
-#endif
-
 /*
  * Tracking user of a slab.
  */
@@ -2304,6 +2300,25 @@ static void flush_all(struct kmem_cache *s)
        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
 }
 
+/*
+ * Use the cpu hotplug callback to ensure that the cpu slabs are flushed
+ * when necessary.
+ */
+static int slub_cpu_dead(unsigned int cpu)
+{
+       struct kmem_cache *s;
+       unsigned long flags;
+
+       mutex_lock(&slab_mutex);
+       list_for_each_entry(s, &slab_caches, list) {
+               local_irq_save(flags);
+               __flush_cpu_slab(s, cpu);
+               local_irq_restore(flags);
+       }
+       mutex_unlock(&slab_mutex);
+       return 0;
+}
+
 /*
  * Check if the objects in a per cpu structure fit numa
  * locality expectations.
@@ -4144,9 +4159,8 @@ void __init kmem_cache_init(void)
        /* Setup random freelists for each cache */
        init_freelist_randomization();
 
-#ifdef CONFIG_SMP
-       register_cpu_notifier(&slab_notifier);
-#endif
+       cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
+                                 slub_cpu_dead);
 
        pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
                cache_line_size(),
@@ -4210,43 +4224,6 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
        return err;
 }
 
-#ifdef CONFIG_SMP
-/*
- * Use the cpu notifier to insure that the cpu slabs are flushed when
- * necessary.
- */
-static int slab_cpuup_callback(struct notifier_block *nfb,
-               unsigned long action, void *hcpu)
-{
-       long cpu = (long)hcpu;
-       struct kmem_cache *s;
-       unsigned long flags;
-
-       switch (action) {
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               mutex_lock(&slab_mutex);
-               list_for_each_entry(s, &slab_caches, list) {
-                       local_irq_save(flags);
-                       __flush_cpu_slab(s, cpu);
-                       local_irq_restore(flags);
-               }
-               mutex_unlock(&slab_mutex);
-               break;
-       default:
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block slab_notifier = {
-       .notifier_call = slab_cpuup_callback
-};
-
-#endif
-
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 {
        struct kmem_cache *s;
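Both the slab.c and slub.c hunks above follow the same conversion pattern: the cpu notifier and its switch over CPU_* actions are replaced by callbacks registered with the CPU hotplug state machine. A hedged sketch of that pattern for a hypothetical subsystem (the foo_* names are invented; the dynamic CPUHP_AP_ONLINE_DYN state mirrors the slab.c hunk):

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int foo_online_cpu(unsigned int cpu)
{
        /* bring per-cpu resources for this cpu up */
        return 0;
}

static int foo_offline_cpu(unsigned int cpu)
{
        /* tear the per-cpu resources down again */
        return 0;
}

static int __init foo_init(void)
{
        int ret;

        /* One call replaces register_cpu_notifier() plus the switch over
         * CPU_ONLINE/CPU_DEAD; a dynamic state id is returned on success. */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
                                foo_online_cpu, foo_offline_cpu);
        return ret < 0 ? ret : 0;
}
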
diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h
new file mode 100644 (file)
index 0000000..7d41026
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LINUX_CORESIGHT_PMU_H
+#define _LINUX_CORESIGHT_PMU_H
+
+#define CORESIGHT_ETM_PMU_NAME "cs_etm"
+#define CORESIGHT_ETM_PMU_SEED  0x10
+
+/* ETMv3.5/PTM's ETMCR config bit */
+#define ETM_OPT_CYCACC  12
+#define ETM_OPT_TS      28
+
+static inline int coresight_get_trace_id(int cpu)
+{
+       /*
+        * A trace ID of value 0 is invalid, so let's start at some
+        * random value that fits in 7 bits and go from there.  Since
+        * the common convention is to have data trace IDs be I(N) + 1,
+        * set instruction trace IDs as a function of the CPU number.
+        */
+       return (CORESIGHT_ETM_PMU_SEED + (cpu * 2));
+}
+
+#endif
diff --git a/tools/include/linux/time64.h b/tools/include/linux/time64.h
new file mode 100644 (file)
index 0000000..df92654
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef _TOOLS_LINUX_TIME64_H
+#define _TOOLS_LINUX_TIME64_H
+
+#define MSEC_PER_SEC   1000L
+#define USEC_PER_MSEC  1000L
+#define NSEC_PER_USEC  1000L
+#define NSEC_PER_MSEC  1000000L
+#define USEC_PER_SEC   1000000L
+#define NSEC_PER_SEC   1000000000L
+#define FSEC_PER_SEC   1000000000000000LL
+
+#endif /* _TOOLS_LINUX_TIME64_H */
index d9836c5eb694c48c30a6d07827ee36ed086b4309..11c8d9bc762ef0c4bde99dec4292e84ff00d3477 100644 (file)
@@ -3266,6 +3266,9 @@ int main(int argc, char *argv[])
                }
        }
 
+       /* If we exit via err(), this kills all the threads, restores tty. */
+       atexit(cleanup_devices);
+
        /* We always have a console device, and it's always device 1. */
        setup_console();
 
@@ -3369,9 +3372,6 @@ int main(int argc, char *argv[])
        /* Ensure that we terminate if a device-servicing child dies. */
        signal(SIGCHLD, kill_launcher);
 
-       /* If we exit via err(), this kills all the threads, restores tty. */
-       atexit(cleanup_devices);
-
        /* If requested, chroot to a directory */
        if (chroot_path) {
                if (chroot(chroot_path) != 0)
index ba7094b945ffc7d312902dc8723acf86be68a6f5..f99f49e4a31e65faba6bad4b1c82b99a01de7ded 100644 (file)
 #define TRACEFS_MAGIC          0x74726163
 #endif
 
+#ifndef HUGETLBFS_MAGIC
+#define HUGETLBFS_MAGIC        0x958458f6
+#endif
+
 static const char * const sysfs__fs_known_mountpoints[] = {
        "/sys",
        0,
@@ -67,6 +71,10 @@ static const char * const tracefs__known_mountpoints[] = {
        0,
 };
 
+static const char * const hugetlbfs__known_mountpoints[] = {
+       0,
+};
+
 struct fs {
        const char              *name;
        const char * const      *mounts;
@@ -80,6 +88,7 @@ enum {
        FS__PROCFS  = 1,
        FS__DEBUGFS = 2,
        FS__TRACEFS = 3,
+       FS__HUGETLBFS = 4,
 };
 
 #ifndef TRACEFS_MAGIC
@@ -107,6 +116,11 @@ static struct fs fs__entries[] = {
                .mounts = tracefs__known_mountpoints,
                .magic  = TRACEFS_MAGIC,
        },
+       [FS__HUGETLBFS] = {
+               .name   = "hugetlbfs",
+               .mounts = hugetlbfs__known_mountpoints,
+               .magic  = HUGETLBFS_MAGIC,
+       },
 };
 
 static bool fs__read_mounts(struct fs *fs)
@@ -265,6 +279,7 @@ FS(sysfs,   FS__SYSFS);
 FS(procfs,  FS__PROCFS);
 FS(debugfs, FS__DEBUGFS);
 FS(tracefs, FS__TRACEFS);
+FS(hugetlbfs, FS__HUGETLBFS);
 
 int filename__read_int(const char *filename, int *value)
 {
index 16c9c2ed7c5bfb85e270399f4d6a7253d9a367b3..a63269f5d20cbe5b9c794f64ad64bf7a16d0ce7d 100644 (file)
@@ -21,6 +21,7 @@ FS(sysfs)
 FS(procfs)
 FS(debugfs)
 FS(tracefs)
+FS(hugetlbfs)
 
 #undef FS
 
index 15949e2a7805cc6e7d848ddbf2f99b249e332f3b..cb081ac59fd11457b909a0f98b398993eaf76119 100644 (file)
@@ -110,6 +110,14 @@ Given a $HOME/.perfconfig like this:
                order = caller
                sort-key = function
 
+       [report]
+               # Defaults
+               sort-order = comm,dso,symbol
+               percent-limit = 0
+               queue-size = 0
+               children = true
+               group = true
+
 Variables
 ~~~~~~~~~
 
@@ -382,6 +390,10 @@ call-graph.*::
                histogram entry. Default is 0 which means no limitation.
 
 report.*::
+       report.sort_order::
+               Allows changing the default sort order from "comm,dso,symbol" to
+               some other default, for instance "sym,dso" may be more fitting for
+               kernel developers.
        report.percent-limit::
                This one is mostly the same as call-graph.threshold but works for
                histogram entries. Entries having an overhead lower than this
index b303bcdd8ed15fb9d140e0e7369388bc714aaace..e6c9902c6d82b0e2535059c1a915356e84856589 100644 (file)
@@ -21,6 +21,8 @@ or
 'perf probe' [options] --vars='PROBEPOINT'
 or
 'perf probe' [options] --funcs
+or
+'perf probe' [options] --definition='PROBE' [...]
 
 DESCRIPTION
 -----------
@@ -34,6 +36,8 @@ OPTIONS
 -k::
 --vmlinux=PATH::
        Specify vmlinux path which has debuginfo (Dwarf binary).
+       Only when used with --definition can you give an offline
+       vmlinux file.
 
 -m::
 --module=MODNAME|PATH::
@@ -96,6 +100,11 @@ OPTIONS
        can also list functions in a user space executable / shared library.
        This also can accept a FILTER rule argument.
 
+-D::
+--definition=::
+       Show the trace-event definition converted from the given probe-event
+       instead of writing it into tracing/[k,u]probe_events.
+
 --filter=FILTER::
        (Only for --vars and --funcs) Set filter. FILTER is a combination of glob
        pattern, see FILTER PATTERN for detail.
@@ -176,13 +185,12 @@ Each probe argument follows below syntax.
 
 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
 '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters.
-'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail)
-
+'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically sets the type based on debuginfo (*). Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal integers (x/x8/x16/x32/x64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail)
 On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
 
 TYPES
 -----
-Basic types (u8/u16/u32/u64/s8/s16/s32/s64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe.
+Basic types (u8/u16/u32/u64/s8/s16/s32/s64) and hexadecimal integers (x8/x16/x32/x64) are integer types. The prefixes 's' and 'u' mean the type is signed or unsigned respectively, and 'x' means the value is shown in hexadecimal format. Traced arguments are shown in decimal (sNN/uNN) or hex (xNN). You can also use 's' or 'u' to specify only the signedness and leave the size auto-detected by perf probe. Likewise, you can use 'x' alone to request hexadecimal output, with the size again auto-detected.
 String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
 Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is;
 
index 1d8d5bc4cd2de6601f926696d5c172b30fa735d4..2b477c1d1efe60b280394d8b407f9a18c3544dd6 100644 (file)
        use_offset = true
        jump_arrows = true
        show_nr_jumps = false
+
+[report]
+
+       # Defaults
+       sort-order = comm,dso,symbol
+       percent-limit = 0
+       queue-size = 0
+       children = true
+       group = true
index ad2534df4ba6ce988429510b5c35360c253299cf..ff200c6cb79020b345e6b179bd0dc9eaae25918e 100644 (file)
@@ -60,6 +60,7 @@ tools/include/asm-generic/bitops.h
 tools/include/linux/atomic.h
 tools/include/linux/bitops.h
 tools/include/linux/compiler.h
+tools/include/linux/coresight-pmu.h
 tools/include/linux/filter.h
 tools/include/linux/hash.h
 tools/include/linux/kernel.h
@@ -77,4 +78,5 @@ tools/include/linux/stringify.h
 tools/include/linux/types.h
 tools/include/linux/err.h
 tools/include/linux/bitmap.h
+tools/include/linux/time64.h
 tools/arch/*/include/uapi/asm/perf_regs.h
index 2d908750163368e9d1ac7af4177c6717ce556300..828cfd766e31892e8b1cf5543bfcac5b33b19df8 100644 (file)
@@ -165,7 +165,7 @@ SUBCMD_DIR  = $(srctree)/tools/lib/subcmd/
 # non-config cases
 config := 1
 
-NON_CONFIG_TARGETS := clean TAGS tags cscope help
+NON_CONFIG_TARGETS := clean TAGS tags cscope help install-doc
 
 ifdef MAKECMDGOALS
 ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
@@ -429,6 +429,9 @@ $(PERF_IN): prepare FORCE
        @(test -f ../../include/asm-generic/bitops/fls64.h && ( \
         (diff -B ../include/asm-generic/bitops/fls64.h ../../include/asm-generic/bitops/fls64.h >/dev/null) \
         || echo "Warning: tools/include/asm-generic/bitops/fls64.h differs from kernel" >&2 )) || true
+       @(test -f ../../include/linux/coresight-pmu.h && ( \
+       (diff -B ../include/linux/coresight-pmu.h ../../include/linux/coresight-pmu.h >/dev/null) \
+       || echo "Warning: tools/include/linux/coresight-pmu.h differs from kernel" >&2 )) || true
        $(Q)$(MAKE) $(build)=perf
 
 $(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST)
diff --git a/tools/perf/arch/arm/include/dwarf-regs-table.h b/tools/perf/arch/arm/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..f298d03
--- /dev/null
@@ -0,0 +1,9 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const arm_regstr_tbl[] = {
+       "%r0", "%r1", "%r2", "%r3", "%r4",
+       "%r5", "%r6", "%r7", "%r8", "%r9", "%r10",
+       "%fp", "%ip", "%sp", "%lr", "%pc",
+};
+#endif
diff --git a/tools/perf/arch/arm64/include/dwarf-regs-table.h b/tools/perf/arch/arm64/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..2675936
--- /dev/null
@@ -0,0 +1,13 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const aarch64_regstr_tbl[] = {
+       "%r0", "%r1", "%r2", "%r3", "%r4",
+       "%r5", "%r6", "%r7", "%r8", "%r9",
+       "%r10", "%r11", "%r12", "%r13", "%r14",
+       "%r15", "%r16", "%r17", "%r18", "%r19",
+       "%r20", "%r21", "%r22", "%r23", "%r24",
+       "%r25", "%r26", "%r27", "%r28", "%r29",
+       "%lr", "%sp",
+};
+#endif
diff --git a/tools/perf/arch/powerpc/include/dwarf-regs-table.h b/tools/perf/arch/powerpc/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..db4730f
--- /dev/null
@@ -0,0 +1,27 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+/*
+ * Reference:
+ * http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html
+ * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf
+ */
+#define REG_DWARFNUM_NAME(reg, idx)    [idx] = "%" #reg
+
+static const char * const powerpc_regstr_tbl[] = {
+       "%gpr0", "%gpr1", "%gpr2", "%gpr3", "%gpr4",
+       "%gpr5", "%gpr6", "%gpr7", "%gpr8", "%gpr9",
+       "%gpr10", "%gpr11", "%gpr12", "%gpr13", "%gpr14",
+       "%gpr15", "%gpr16", "%gpr17", "%gpr18", "%gpr19",
+       "%gpr20", "%gpr21", "%gpr22", "%gpr23", "%gpr24",
+       "%gpr25", "%gpr26", "%gpr27", "%gpr28", "%gpr29",
+       "%gpr30", "%gpr31",
+       REG_DWARFNUM_NAME(msr,   66),
+       REG_DWARFNUM_NAME(ctr,   109),
+       REG_DWARFNUM_NAME(link,  108),
+       REG_DWARFNUM_NAME(xer,   101),
+       REG_DWARFNUM_NAME(dar,   119),
+       REG_DWARFNUM_NAME(dsisr, 118),
+};
+
+#endif
index 35745a733100e70f27c6c95ce97ce4ef5a6fabfa..ed9d5d15d5b69223b7fd915cae1a982d53b73e4b 100644 (file)
@@ -108,7 +108,7 @@ void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
        int i = 0;
 
        map = get_target_map(pev->target, pev->uprobes);
-       if (!map || map__load(map, NULL) < 0)
+       if (!map || map__load(map) < 0)
                return;
 
        for (i = 0; i < ntevs; i++) {
diff --git a/tools/perf/arch/s390/include/dwarf-regs-table.h b/tools/perf/arch/s390/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..9da74a9
--- /dev/null
@@ -0,0 +1,8 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const s390_regstr_tbl[] = {
+       "%r0", "%r1",  "%r2",  "%r3",  "%r4",  "%r5",  "%r6",  "%r7",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+};
+#endif
diff --git a/tools/perf/arch/sh/include/dwarf-regs-table.h b/tools/perf/arch/sh/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..3a2deaf
--- /dev/null
@@ -0,0 +1,25 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const sh_regstr_tbl[] = {
+       "r0",
+       "r1",
+       "r2",
+       "r3",
+       "r4",
+       "r5",
+       "r6",
+       "r7",
+       "r8",
+       "r9",
+       "r10",
+       "r11",
+       "r12",
+       "r13",
+       "r14",
+       "r15",
+       "pc",
+       "pr",
+};
+
+#endif
diff --git a/tools/perf/arch/sparc/include/dwarf-regs-table.h b/tools/perf/arch/sparc/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..12c0761
--- /dev/null
@@ -0,0 +1,18 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const sparc_regstr_tbl[] = {
+       "%g0", "%g1", "%g2", "%g3", "%g4", "%g5", "%g6", "%g7",
+       "%o0", "%o1", "%o2", "%o3", "%o4", "%o5", "%sp", "%o7",
+       "%l0", "%l1", "%l2", "%l3", "%l4", "%l5", "%l6", "%l7",
+       "%i0", "%i1", "%i2", "%i3", "%i4", "%i5", "%fp", "%i7",
+       "%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
+       "%f8", "%f9", "%f10", "%f11", "%f12", "%f13", "%f14", "%f15",
+       "%f16", "%f17", "%f18", "%f19", "%f20", "%f21", "%f22", "%f23",
+       "%f24", "%f25", "%f26", "%f27", "%f28", "%f29", "%f30", "%f31",
+       "%f32", "%f33", "%f34", "%f35", "%f36", "%f37", "%f38", "%f39",
+       "%f40", "%f41", "%f42", "%f43", "%f44", "%f45", "%f46", "%f47",
+       "%f48", "%f49", "%f50", "%f51", "%f52", "%f53", "%f54", "%f55",
+       "%f56", "%f57", "%f58", "%f59", "%f60", "%f61", "%f62", "%f63",
+};
+#endif
diff --git a/tools/perf/arch/x86/include/dwarf-regs-table.h b/tools/perf/arch/x86/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..39ac7cb
--- /dev/null
@@ -0,0 +1,14 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const x86_32_regstr_tbl[] = {
+       "%ax", "%cx", "%dx", "%bx", "$stack",/* Stack address instead of %sp */
+       "%bp", "%si", "%di",
+};
+
+static const char * const x86_64_regstr_tbl[] = {
+       "%ax", "%dx", "%cx", "%bx", "%si", "%di",
+       "%bp", "%sp", "%r8", "%r9", "%r10", "%r11",
+       "%r12", "%r13", "%r14", "%r15",
+};
+#endif
diff --git a/tools/perf/arch/xtensa/include/dwarf-regs-table.h b/tools/perf/arch/xtensa/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..aa0444a
--- /dev/null
@@ -0,0 +1,8 @@
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+/* This is included in perf/util/dwarf-regs.c */
+
+static const char * const xtensa_regstr_tbl[] = {
+       "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7",
+       "a8", "a9", "a10", "a11", "a12", "a13", "a14", "a15",
+};
+#endif
index f96e22ed9f873de6aada5bb9a3e5ac0fdc902b35..2b9705a8734cd6005fd3149943e2ff20eecf4bbe 100644 (file)
@@ -16,6 +16,7 @@
 #include <subcmd/parse-options.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
+#include <linux/time64.h>
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
@@ -62,7 +63,7 @@ static void print_summary(void)
        printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n",
               requeued_avg,
               nthreads,
-              requeuetime_avg/1e3,
+              requeuetime_avg / USEC_PER_MSEC,
               rel_stddev_stats(requeuetime_stddev, requeuetime_avg));
 }
 
@@ -184,7 +185,7 @@ int bench_futex_requeue(int argc, const char **argv,
 
                if (!silent) {
                        printf("[Run %d]: Requeued %d of %d threads in %.4f ms\n",
-                              j + 1, nrequeued, nthreads, runtime.tv_usec/1e3);
+                              j + 1, nrequeued, nthreads, runtime.tv_usec / (double)USEC_PER_MSEC);
                }
 
                /* everybody should be blocked on futex2, wake'em up */
index 4a2ecd7438ca0ffeb3f9e354e1a24668f06db338..2c8fa67ad53767e43c7e91dc84279528268fcb3a 100644 (file)
@@ -15,6 +15,7 @@
 #include <subcmd/parse-options.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
+#include <linux/time64.h>
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
@@ -156,7 +157,7 @@ static void print_run(struct thread_data *waking_worker, unsigned int run_num)
 
        printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) "
               "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg,
-              nblocked_threads, waketime_avg/1e3,
+              nblocked_threads, waketime_avg / USEC_PER_MSEC,
               rel_stddev_stats(waketime_stddev, waketime_avg));
 }
 
@@ -172,7 +173,7 @@ static void print_summary(void)
        printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n",
               wakeup_avg,
               nblocked_threads,
-              waketime_avg/1e3,
+              waketime_avg / USEC_PER_MSEC,
               rel_stddev_stats(waketime_stddev, waketime_avg));
 }
 
index 87d8f4f292d95bfa715c8c2408a69a712c33da78..e246b1b8388a3fd3bfc7c9c398a90efc26d86c79 100644 (file)
@@ -16,6 +16,7 @@
 #include <subcmd/parse-options.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
+#include <linux/time64.h>
 #include <errno.h>
 #include "bench.h"
 #include "futex.h"
@@ -81,7 +82,7 @@ static void print_summary(void)
        printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n",
               wakeup_avg,
               nthreads,
-              waketime_avg/1e3,
+              waketime_avg / USEC_PER_MSEC,
               rel_stddev_stats(waketime_stddev, waketime_avg));
 }
 
@@ -182,7 +183,7 @@ int bench_futex_wake(int argc, const char **argv,
 
                if (!silent) {
                        printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n",
-                              j + 1, nwoken, nthreads, runtime.tv_usec/1e3);
+                              j + 1, nwoken, nthreads, runtime.tv_usec / (double)USEC_PER_MSEC);
                }
 
                for (i = 0; i < nthreads; i++) {
index 2b54d0f2672a39eaee68c5b5a48de2c01ba2956f..c684910e5a482af6bee4ebd15c86708813ff6777 100644 (file)
@@ -21,6 +21,7 @@
 #include <string.h>
 #include <sys/time.h>
 #include <errno.h>
+#include <linux/time64.h>
 
 #define K 1024
 
@@ -89,7 +90,7 @@ static u64 get_cycles(void)
 
 static double timeval2double(struct timeval *ts)
 {
-       return (double)ts->tv_sec + (double)ts->tv_usec / (double)1000000;
+       return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC;
 }
 
 #define print_bps(x) do {                                              \
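
The bench/futex and bench/mem hunks above all make the same substitution: literal scaling factors (1e3, 1000000) become the named constants from linux/time64.h. For reference, a minimal standalone sketch of that pattern; the constants are defined locally here (matching the usual time64.h values) so the sketch compiles on its own, and the workload is left as a placeholder:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/time.h>

#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL

/* Same conversion as timeval2double() in the hunk above. */
static double timeval2double(struct timeval *tv)
{
	return (double)tv->tv_sec + (double)tv->tv_usec / (double)USEC_PER_SEC;
}

int main(void)
{
	struct timeval start, stop, diff;

	gettimeofday(&start, NULL);
	/* ... workload goes here ... */
	gettimeofday(&stop, NULL);
	timersub(&stop, &start, &diff);

	/* Sub-second part reported in milliseconds, as in print_summary(). */
	printf("%.4f ms\n", diff.tv_usec / (double)USEC_PER_MSEC);
	printf("%.6f s\n", timeval2double(&diff));
	return 0;
}
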
index f7f530081aa9421a0e01ae362fa02c3192c2896c..8efe904e486bf98b63e24b7904652f10ca5ec7e9 100644 (file)
@@ -30,6 +30,7 @@
 #include <sys/wait.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
+#include <linux/time64.h>
 
 #include <numa.h>
 #include <numaif.h>
@@ -1004,7 +1005,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
        if (strong && process_groups == g->p.nr_proc) {
                if (!*convergence) {
                        *convergence = runtime_ns_max;
-                       tprintf(" (%6.1fs converged)\n", *convergence/1e9);
+                       tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
                        if (g->p.measure_convergence) {
                                g->all_converged = true;
                                g->stop_work = true;
@@ -1012,7 +1013,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
                }
        } else {
                if (*convergence) {
-                       tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
+                       tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
                        *convergence = 0;
                }
                tprintf("\n");
@@ -1022,7 +1023,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 static void show_summary(double runtime_ns_max, int l, double *convergence)
 {
        tprintf("\r #  %5.1f%%  [%.1f mins]",
-               (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
+               (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0);
 
        calc_convergence(runtime_ns_max, convergence);
 
@@ -1179,8 +1180,8 @@ static void *worker_thread(void *__tdata)
 
                if (details >= 3) {
                        timersub(&stop, &start, &diff);
-                       runtime_ns_max = diff.tv_sec * 1000000000;
-                       runtime_ns_max += diff.tv_usec * 1000;
+                       runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
+                       runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
 
                        if (details >= 0) {
                                printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
@@ -1192,23 +1193,23 @@ static void *worker_thread(void *__tdata)
                        continue;
 
                timersub(&stop, &start0, &diff);
-               runtime_ns_max = diff.tv_sec * 1000000000ULL;
-               runtime_ns_max += diff.tv_usec * 1000ULL;
+               runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
+               runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
 
                show_summary(runtime_ns_max, l, &convergence);
        }
 
        gettimeofday(&stop, NULL);
        timersub(&stop, &start0, &diff);
-       td->runtime_ns = diff.tv_sec * 1000000000ULL;
-       td->runtime_ns += diff.tv_usec * 1000ULL;
-       td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+       td->runtime_ns = diff.tv_sec * NSEC_PER_SEC;
+       td->runtime_ns += diff.tv_usec * NSEC_PER_USEC;
+       td->speed_gbs = bytes_done / (td->runtime_ns / NSEC_PER_SEC) / 1e9;
 
        getrusage(RUSAGE_THREAD, &rusage);
-       td->system_time_ns = rusage.ru_stime.tv_sec * 1000000000ULL;
-       td->system_time_ns += rusage.ru_stime.tv_usec * 1000ULL;
-       td->user_time_ns = rusage.ru_utime.tv_sec * 1000000000ULL;
-       td->user_time_ns += rusage.ru_utime.tv_usec * 1000ULL;
+       td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC;
+       td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC;
+       td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC;
+       td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;
 
        free_data(thread_data, g->p.bytes_thread);
 
@@ -1469,7 +1470,7 @@ static int __bench_numa(const char *name)
        }
        /* Wait for all the threads to start up: */
        while (g->nr_tasks_started != g->p.nr_tasks)
-               usleep(1000);
+               usleep(USEC_PER_MSEC);
 
        BUG_ON(g->nr_tasks_started != g->p.nr_tasks);
 
@@ -1488,9 +1489,9 @@ static int __bench_numa(const char *name)
 
                timersub(&stop, &start, &diff);
 
-               startup_sec = diff.tv_sec * 1000000000.0;
-               startup_sec += diff.tv_usec * 1000.0;
-               startup_sec /= 1e9;
+               startup_sec = diff.tv_sec * NSEC_PER_SEC;
+               startup_sec += diff.tv_usec * NSEC_PER_USEC;
+               startup_sec /= NSEC_PER_SEC;
 
                tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
                tprintf(" #\n");
@@ -1529,14 +1530,14 @@ static int __bench_numa(const char *name)
        tprintf("\n ###\n");
        tprintf("\n");
 
-       runtime_sec_max = diff.tv_sec * 1000000000.0;
-       runtime_sec_max += diff.tv_usec * 1000.0;
-       runtime_sec_max /= 1e9;
+       runtime_sec_max = diff.tv_sec * NSEC_PER_SEC;
+       runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
+       runtime_sec_max /= NSEC_PER_SEC;
 
-       runtime_sec_min = runtime_ns_min/1e9;
+       runtime_sec_min = runtime_ns_min / NSEC_PER_SEC;
 
        bytes = g->bytes_done;
-       runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
+       runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC;
 
        if (g->p.measure_convergence) {
                print_res(name, runtime_sec_max,
@@ -1562,7 +1563,7 @@ static int __bench_numa(const char *name)
        print_res(name, bytes / 1e9,
                "GB,", "data-total",            "GB data processed, total");
 
-       print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks),
+       print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks),
                "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");
 
        print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
@@ -1581,9 +1582,9 @@ static int __bench_numa(const char *name)
                                snprintf(tname, 32, "process%d:thread%d", p, t);
                                print_res(tname, td->speed_gbs,
                                        "GB/sec",       "thread-speed", "GB/sec/thread speed");
-                               print_res(tname, td->system_time_ns / 1e9,
+                               print_res(tname, td->system_time_ns / NSEC_PER_SEC,
                                        "secs", "thread-system-time", "system CPU time/thread");
-                               print_res(tname, td->user_time_ns / 1e9,
+                               print_res(tname, td->user_time_ns / NSEC_PER_SEC,
                                        "secs", "thread-user-time", "user CPU time/thread");
                        }
                }
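
The bench/numa hunks follow a second recurring pattern: a struct timeval (from gettimeofday() or getrusage()) is folded into a 64-bit nanosecond counter via NSEC_PER_SEC and NSEC_PER_USEC, and converted back to seconds only for printing. A minimal standalone sketch, again with the constants defined locally and uint64_t standing in for u64; the sketch uses RUSAGE_SELF for portability where the patch uses RUSAGE_THREAD:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>

#define NSEC_PER_USEC	1000ULL
#define NSEC_PER_SEC	1000000000ULL

/* timeval -> nanoseconds, as in worker_thread() above. */
static uint64_t timeval_to_ns(const struct timeval *tv)
{
	return (uint64_t)tv->tv_sec * NSEC_PER_SEC +
	       (uint64_t)tv->tv_usec * NSEC_PER_USEC;
}

int main(void)
{
	struct timeval start, stop, diff;
	struct rusage ru;
	uint64_t runtime_ns, system_ns, user_ns;

	gettimeofday(&start, NULL);
	/* ... workload goes here ... */
	gettimeofday(&stop, NULL);
	timersub(&stop, &start, &diff);
	runtime_ns = timeval_to_ns(&diff);

	getrusage(RUSAGE_SELF, &ru);	/* the patch uses RUSAGE_THREAD */
	system_ns = timeval_to_ns(&ru.ru_stime);
	user_ns = timeval_to_ns(&ru.ru_utime);

	printf("runtime %.6f s, sys %.6f s, user %.6f s\n",
	       runtime_ns / (double)NSEC_PER_SEC,
	       system_ns / (double)NSEC_PER_SEC,
	       user_ns / (double)NSEC_PER_SEC);
	return 0;
}
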
index bfaf9503de8ef4d96a85770afb4f658c11e7d2e2..6a111e775210f700de6e3b796b703675748a0b05 100644 (file)
@@ -29,6 +29,7 @@
 #include <poll.h>
 #include <limits.h>
 #include <err.h>
+#include <linux/time64.h>
 
 #define DATASIZE 100
 
@@ -312,11 +313,11 @@ int bench_sched_messaging(int argc, const char **argv,
                       thread_mode ? "threads" : "processes");
                printf(" %14s: %lu.%03lu [sec]\n", "Total time",
                       diff.tv_sec,
-                      (unsigned long) (diff.tv_usec/1000));
+                      (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
                break;
        case BENCH_FORMAT_SIMPLE:
                printf("%lu.%03lu\n", diff.tv_sec,
-                      (unsigned long) (diff.tv_usec/1000));
+                      (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
                break;
        default:
                /* reaching here is something disaster */
index 1dc2d13cc2722c7c0207edc4fafb1b7f77b7697d..2243f0150d76452da3502c0d5f3ead12b85de512 100644 (file)
@@ -25,6 +25,7 @@
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
+#include <linux/time64.h>
 
 #include <pthread.h>
 
@@ -153,24 +154,24 @@ int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unu
                printf("# Executed %d pipe operations between two %s\n\n",
                        loops, threaded ? "threads" : "processes");
 
-               result_usec = diff.tv_sec * 1000000;
+               result_usec = diff.tv_sec * USEC_PER_SEC;
                result_usec += diff.tv_usec;
 
                printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
                       diff.tv_sec,
-                      (unsigned long) (diff.tv_usec/1000));
+                      (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
 
                printf(" %14lf usecs/op\n",
                       (double)result_usec / (double)loops);
                printf(" %14d ops/sec\n",
                       (int)((double)loops /
-                            ((double)result_usec / (double)1000000)));
+                            ((double)result_usec / (double)USEC_PER_SEC)));
                break;
 
        case BENCH_FORMAT_SIMPLE:
                printf("%lu.%03lu\n",
                       diff.tv_sec,
-                      (unsigned long) (diff.tv_usec / 1000));
+                      (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
                break;
 
        default:
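
The sched-pipe hunk reports throughput from the same total: microseconds per operation and operations per second, both derived from result_usec. A compact standalone sketch of that arithmetic with made-up sample numbers:

#include <stdio.h>
#include <sys/time.h>

#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL

static void report(const struct timeval *diff, unsigned long loops)
{
	unsigned long long result_usec;

	result_usec = (unsigned long long)diff->tv_sec * USEC_PER_SEC;
	result_usec += diff->tv_usec;

	printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
	       (unsigned long)diff->tv_sec,
	       (unsigned long)(diff->tv_usec / USEC_PER_MSEC));
	printf(" %14f usecs/op\n", (double)result_usec / (double)loops);
	printf(" %14d ops/sec\n",
	       (int)((double)loops / ((double)result_usec / (double)USEC_PER_SEC)));
}

int main(void)
{
	/* e.g. one million pipe round trips finished in 2.5 seconds */
	struct timeval diff = { .tv_sec = 2, .tv_usec = 500000 };

	report(&diff, 1000000UL);
	return 0;
}
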
index 9c1034d81b4fe3cc72d4b3b09ac8d527d07b0da0..ebb628332a6e59a938347eca53c3da81c859aee5 100644 (file)
@@ -30,6 +30,7 @@
 #include "util/tool.h"
 #include "util/data.h"
 #include "arch/common.h"
+#include "util/block-range.h"
 
 #include <dlfcn.h>
 #include <linux/bitmap.h>
@@ -46,6 +47,103 @@ struct perf_annotate {
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
+/*
+ * Given one basic block:
+ *
+ *     from    to              branch_i
+ *     * ----> *
+ *             |
+ *             | block
+ *             v
+ *             * ----> *
+ *             from    to      branch_i+1
+ *
+ * where the horizontal are the branches and the vertical is the executed
+ * block of instructions.
+ *
+ * We count, for each 'instruction', the number of blocks that covered it, as
+ * well as the ratio at which each branch is taken.
+ *
+ * We can do this without knowing the actual instruction stream by keeping
+ * track of the address ranges. We break down ranges such that there is no
+ * overlap and iterate from the start until the end.
+ *
+ * @acme: once we parse the objdump output _before_ processing the samples,
+ * we can easily fold the branch.cycles IPC bits in.
+ */
+static void process_basic_block(struct addr_map_symbol *start,
+                               struct addr_map_symbol *end,
+                               struct branch_flags *flags)
+{
+       struct symbol *sym = start->sym;
+       struct annotation *notes = sym ? symbol__annotation(sym) : NULL;
+       struct block_range_iter iter;
+       struct block_range *entry;
+
+       /*
+        * Sanity; NULL isn't executable and the CPU cannot execute backwards
+        */
+       if (!start->addr || start->addr > end->addr)
+               return;
+
+       iter = block_range__create(start->addr, end->addr);
+       if (!block_range_iter__valid(&iter))
+               return;
+
+       /*
+        * First block in range is a branch target.
+        */
+       entry = block_range_iter(&iter);
+       assert(entry->is_target);
+       entry->entry++;
+
+       do {
+               entry = block_range_iter(&iter);
+
+               entry->coverage++;
+               entry->sym = sym;
+
+               if (notes)
+                       notes->max_coverage = max(notes->max_coverage, entry->coverage);
+
+       } while (block_range_iter__next(&iter));
+
+       /*
+        * Last block in range is a branch.
+        */
+       entry = block_range_iter(&iter);
+       assert(entry->is_branch);
+       entry->taken++;
+       if (flags->predicted)
+               entry->pred++;
+}
+
+static void process_branch_stack(struct branch_stack *bs, struct addr_location *al,
+                                struct perf_sample *sample)
+{
+       struct addr_map_symbol *prev = NULL;
+       struct branch_info *bi;
+       int i;
+
+       if (!bs || !bs->nr)
+               return;
+
+       bi = sample__resolve_bstack(sample, al);
+       if (!bi)
+               return;
+
+       for (i = bs->nr - 1; i >= 0; i--) {
+               /*
+                * XXX filter against symbol
+                */
+               if (prev)
+                       process_basic_block(prev, &bi[i].from, &bi[i].flags);
+               prev = &bi[i].to;
+       }
+
+       free(bi);
+}
+
 static int perf_evsel__add_sample(struct perf_evsel *evsel,
                                  struct perf_sample *sample,
                                  struct addr_location *al,
@@ -72,6 +170,12 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
                return 0;
        }
 
+       /*
+        * XXX filtered samples can still have branch entries pointing into our
+        * symbol and are missed.
+        */
+       process_branch_stack(sample->branch_stack, al, sample);
+
        sample->period = 1;
        sample->weight = 1;
 
@@ -204,8 +308,6 @@ static int __cmd_annotate(struct perf_annotate *ann)
        struct perf_evsel *pos;
        u64 total_nr_samples;
 
-       machines__set_symbol_filter(&session->machines, symbol__annotate_init);
-
        if (ann->cpu_list) {
                ret = perf_session__cpu_bitmap(session, ann->cpu_list,
                                               ann->cpu_bitmap);
@@ -367,7 +469,10 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
        if (annotate.session == NULL)
                return -1;
 
-       symbol_conf.priv_size = sizeof(struct annotation);
+       ret = symbol__annotation_init();
+       if (ret < 0)
+               goto out_delete;
+
        symbol_conf.try_vmlinux_path = true;
 
        ret = symbol__init(&annotate.session->header.env);
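
process_basic_block() and its comment describe counting, per instruction, how many sampled basic blocks covered it: a block runs from the target of one branch record to the source of the next, and the records are walked oldest to newest. A much-simplified standalone sketch of that idea, using a plain per-address counter array and toy branch records (both made up for illustration) instead of the block_range tree the patch actually uses:

#include <stdint.h>
#include <stdio.h>

#define CODE_SIZE 64	/* toy "function" of 64 byte addresses */

struct branch { uint64_t from, to; };	/* one LBR-style branch record */

static unsigned int coverage[CODE_SIZE];	/* hits per address */

/*
 * A block runs from the target of branch i to the source of branch i+1.
 * Branch records are stored newest first, so walk them backwards, as
 * process_branch_stack() does above.
 */
static void account(const struct branch *br, int nr)
{
	uint64_t prev_to = 0;
	int i;

	for (i = nr - 1; i >= 0; i--) {
		if (prev_to && prev_to <= br[i].from && br[i].from < CODE_SIZE) {
			uint64_t a;

			for (a = prev_to; a <= br[i].from; a++)
				coverage[a]++;
		}
		prev_to = br[i].to;
	}
}

int main(void)
{
	/* newest first: block 4..9 executed after block 8..15 */
	const struct branch stack[] = {
		{ .from = 9,  .to = 40 },	/* newest */
		{ .from = 15, .to = 4  },
		{ .from = 3,  .to = 8  },	/* oldest */
	};
	uint64_t a;

	account(stack, 3);
	for (a = 0; a < CODE_SIZE; a++)
		if (coverage[a])
			printf("addr %2llu covered %u time(s)\n",
			       (unsigned long long)a, coverage[a]);
	return 0;
}
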
index 21ee753211adfa954b9e25fb1df9be8354dfc8a5..9ff0db4e2d0cd1bcc35d2ea137871093ebfcb405 100644 (file)
@@ -1033,7 +1033,9 @@ static int hpp__entry_global(struct perf_hpp_fmt *_fmt, struct perf_hpp *hpp,
 }
 
 static int hpp__header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
-                      struct hists *hists __maybe_unused)
+                      struct hists *hists __maybe_unused,
+                      int line __maybe_unused,
+                      int *span __maybe_unused)
 {
        struct diff_hpp_fmt *dfmt =
                container_of(fmt, struct diff_hpp_fmt, fmt);
index 73c1c4cc36009d79a3f33c80db6e4834dc21b81a..b9bc7e39833a47055608feffa1f5917aada33174 100644 (file)
@@ -429,7 +429,7 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
        if (al.map != NULL) {
                if (!al.map->dso->hit) {
                        al.map->dso->hit = 1;
-                       if (map__load(al.map, NULL) >= 0) {
+                       if (map__load(al.map) >= 0) {
                                dso__inject_build_id(al.map->dso, tool, machine);
                                /*
                                 * If this fails, too bad, let the other side
index fdde1bd3e3062bbf266ef165eb4dce8512f4d9b3..d426dcb18ce9a0d9756c274844bc84af9bcc5b0a 100644 (file)
@@ -330,7 +330,7 @@ static int build_alloc_func_list(void)
        }
 
        kernel_map = machine__kernel_map(machine);
-       if (map__load(kernel_map, NULL) < 0) {
+       if (map__load(kernel_map) < 0) {
                pr_err("cannot load kernel map\n");
                return -ENOENT;
        }
@@ -979,7 +979,7 @@ static void __print_slab_result(struct rb_root *root,
                if (is_caller) {
                        addr = data->call_site;
                        if (!raw_ip)
-                               sym = machine__find_kernel_function(machine, addr, &map, NULL);
+                               sym = machine__find_kernel_function(machine, addr, &map);
                } else
                        addr = data->ptr;
 
@@ -1043,8 +1043,7 @@ static void __print_page_alloc_result(struct perf_session *session, int n_lines)
                char *caller = buf;
 
                data = rb_entry(next, struct page_stat, node);
-               sym = machine__find_kernel_function(machine, data->callsite,
-                                                   &map, NULL);
+               sym = machine__find_kernel_function(machine, data->callsite, &map);
                if (sym && sym->name)
                        caller = sym->name;
                else
@@ -1086,8 +1085,7 @@ static void __print_page_caller_result(struct perf_session *session, int n_lines
                char *caller = buf;
 
                data = rb_entry(next, struct page_stat, node);
-               sym = machine__find_kernel_function(machine, data->callsite,
-                                                   &map, NULL);
+               sym = machine__find_kernel_function(machine, data->callsite, &map);
                if (sym && sym->name)
                        caller = sym->name;
                else
index 5e2127e04f8386c4aa92b62e0370afff08d9e9b5..08fa88f62a246f4f309410da45379aa21949f14e 100644 (file)
@@ -24,6 +24,7 @@
 #include <sys/timerfd.h>
 #endif
 
+#include <linux/time64.h>
 #include <termios.h>
 #include <semaphore.h>
 #include <pthread.h>
@@ -362,7 +363,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
                if (!skip_event(decode)) {
                        pr_info("%" PRIu64 " VM %d, vcpu %d: %s event took %" PRIu64 "usec\n",
                                 sample->time, sample->pid, vcpu_record->vcpu_id,
-                                decode, time_diff/1000);
+                                decode, time_diff / NSEC_PER_USEC);
                }
        }
 
@@ -608,15 +609,15 @@ static void print_result(struct perf_kvm_stat *kvm)
                pr_info("%10llu ", (unsigned long long)ecount);
                pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
                pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
-               pr_info("%9.2fus ", (double)min / 1e3);
-               pr_info("%9.2fus ", (double)max / 1e3);
-               pr_info("%9.2fus ( +-%7.2f%% )", (double)etime / ecount/1e3,
+               pr_info("%9.2fus ", (double)min / NSEC_PER_USEC);
+               pr_info("%9.2fus ", (double)max / NSEC_PER_USEC);
+               pr_info("%9.2fus ( +-%7.2f%% )", (double)etime / ecount / NSEC_PER_USEC,
                        kvm_event_rel_stddev(vcpu, event));
                pr_info("\n");
        }
 
        pr_info("\nTotal Samples:%" PRIu64 ", Total events handled time:%.2fus.\n\n",
-               kvm->total_count, kvm->total_time / 1e3);
+               kvm->total_count, kvm->total_time / (double)NSEC_PER_USEC);
 
        if (kvm->lost_events)
                pr_info("\nLost events: %" PRIu64 "\n\n", kvm->lost_events);
index ee5b42173ba374e066551f32eba24be8c863aebc..f87996b0cb299636e5803a699710be944d3924e3 100644 (file)
@@ -326,6 +326,11 @@ static int perf_add_probe_events(struct perf_probe_event *pevs, int npevs)
        if (ret < 0)
                goto out_cleanup;
 
+       if (params.command == 'D') {    /* -D only shows the definition */
+               ret = show_probe_trace_events(pevs, npevs);
+               goto out_cleanup;
+       }
+
        ret = apply_perf_probe_events(pevs, npevs);
        if (ret < 0)
                goto out_cleanup;
@@ -454,6 +459,14 @@ out:
        return ret;
 }
 
+#ifdef HAVE_DWARF_SUPPORT
+#define PROBEDEF_STR   \
+       "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT [[NAME=]ARG ...]"
+#else
+#define PROBEDEF_STR   "[EVENT=]FUNC[+OFF|%return] [[NAME=]ARG ...]"
+#endif
+
+
 static int
 __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 {
@@ -479,13 +492,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                             opt_set_filter_with_command, DEFAULT_LIST_FILTER),
        OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
                     opt_set_filter_with_command),
-       OPT_CALLBACK('a', "add", NULL,
-#ifdef HAVE_DWARF_SUPPORT
-               "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
-               " [[NAME=]ARG ...]",
-#else
-               "[EVENT=]FUNC[+OFF|%return] [[NAME=]ARG ...]",
-#endif
+       OPT_CALLBACK('a', "add", NULL, PROBEDEF_STR,
                "probe point definition, where\n"
                "\t\tGROUP:\tGroup name (optional)\n"
                "\t\tEVENT:\tEvent name\n"
@@ -503,6 +510,9 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                "\t\tARG:\tProbe argument (kprobe-tracer argument format.)\n",
 #endif
                opt_add_probe_event),
+       OPT_CALLBACK('D', "definition", NULL, PROBEDEF_STR,
+               "Show the trace event definition of the given probe event for k/uprobe_events.",
+               opt_add_probe_event),
        OPT_BOOLEAN('f', "force", &probe_conf.force_add, "forcibly add events"
                    " with existing name"),
        OPT_CALLBACK('L', "line", NULL,
@@ -548,6 +558,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 
        set_option_flag(options, 'a', "add", PARSE_OPT_EXCLUSIVE);
        set_option_flag(options, 'd', "del", PARSE_OPT_EXCLUSIVE);
+       set_option_flag(options, 'D', "definition", PARSE_OPT_EXCLUSIVE);
        set_option_flag(options, 'l', "list", PARSE_OPT_EXCLUSIVE);
 #ifdef HAVE_DWARF_SUPPORT
        set_option_flag(options, 'L', "line", PARSE_OPT_EXCLUSIVE);
@@ -600,6 +611,14 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
         */
        symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 
+       /*
+        * Except for --list, --del and --add, other commands neither depend on
+        * nor change the running kernel. So if the user gives an offline vmlinux,
+        * ignore its buildid.
+        */
+       if (!strchr("lda", params.command) && symbol_conf.vmlinux_name)
+               symbol_conf.ignore_vmlinux_buildid = true;
+
        switch (params.command) {
        case 'l':
                if (params.uprobes) {
@@ -643,7 +662,9 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                        return ret;
                }
                break;
+       case 'D':
        case 'a':
+
                /* Ensure the last given target is used */
                if (params.target && !params.target_used) {
                        pr_err("  Error: -x/-m must follow the probe definitions.\n");
index 6355902fbfc8a28137335c49450ca4cf27a21ddb..03251c7f14ecca1dcc2477333f743f6f89ad8dff 100644 (file)
@@ -42,7 +42,7 @@
 #include <sched.h>
 #include <sys/mman.h>
 #include <asm/bug.h>
-
+#include <linux/time64.h>
 
 struct record {
        struct perf_tool        tool;
@@ -96,7 +96,7 @@ backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
        *start = head;
        while (true) {
                if (evt_head - head >= (unsigned int)size) {
-                       pr_debug("Finshed reading backward ring buffer: rewind\n");
+                       pr_debug("Finished reading backward ring buffer: rewind\n");
                        if (evt_head - head > (unsigned int)size)
                                evt_head -= pheader->size;
                        *end = evt_head;
@@ -106,7 +106,7 @@ backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
                pheader = (struct perf_event_header *)(buf + (evt_head & mask));
 
                if (pheader->size == 0) {
-                       pr_debug("Finshed reading backward ring buffer: get start\n");
+                       pr_debug("Finished reading backward ring buffer: get start\n");
                        *end = evt_head;
                        return 0;
                }
@@ -954,7 +954,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
        }
 
        if (opts->initial_delay) {
-               usleep(opts->initial_delay * 1000);
+               usleep(opts->initial_delay * USEC_PER_MSEC);
                perf_evlist__enable(rec->evlist);
        }
 
index 949e5a15c960e2ef190cc97217df29d561c73777..1a07c4cdf6edf9f7eb590858cc7296c422c3168a 100644 (file)
@@ -89,6 +89,10 @@ static int report__config(const char *var, const char *value, void *cb)
                rep->queue_size = perf_config_u64(var, value);
                return 0;
        }
+       if (!strcmp(var, "report.sort_order")) {
+               default_sort_order = strdup(value);
+               return 0;
+       }
 
        return 0;
 }
@@ -980,9 +984,9 @@ repeat:
         * implementation.
         */
        if (ui__has_annotation()) {
-               symbol_conf.priv_size = sizeof(struct annotation);
-               machines__set_symbol_filter(&session->machines,
-                                           symbol__annotate_init);
+               ret = symbol__annotation_init();
+               if (ret < 0)
+                       goto error;
                /*
                 * For searching by name on the "Browse map details".
                 * providing it only in verbose mode not to bloat too
index 0dfe8df2ab9b237b2530bbb4c81967b9eac6a689..f5503ca22e1c514493567fa0e785073c23de96ee 100644 (file)
@@ -26,6 +26,7 @@
 #include <pthread.h>
 #include <math.h>
 #include <api/fs/fs.h>
+#include <linux/time64.h>
 
 #define PR_SET_NAME            15               /* Set process name */
 #define MAX_CPUS               4096
@@ -199,7 +200,7 @@ static u64 get_nsecs(void)
 
        clock_gettime(CLOCK_MONOTONIC, &ts);
 
-       return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+       return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
 }
 
 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
@@ -223,7 +224,7 @@ static void sleep_nsecs(u64 nsecs)
 
 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
 {
-       u64 T0, T1, delta, min_delta = 1000000000ULL;
+       u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
        int i;
 
        for (i = 0; i < 10; i++) {
@@ -240,7 +241,7 @@ static void calibrate_run_measurement_overhead(struct perf_sched *sched)
 
 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
 {
-       u64 T0, T1, delta, min_delta = 1000000000ULL;
+       u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
        int i;
 
        for (i = 0; i < 10; i++) {
@@ -452,8 +453,8 @@ static u64 get_cpu_usage_nsec_parent(void)
        err = getrusage(RUSAGE_SELF, &ru);
        BUG_ON(err);
 
-       sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
-       sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
+       sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
+       sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
 
        return sum;
 }
@@ -667,12 +668,12 @@ static void run_one_test(struct perf_sched *sched)
                sched->run_avg = delta;
        sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
 
-       printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
+       printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
 
-       printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
+       printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
 
        printf("cpu: %0.2f / %0.2f",
-               (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
+               (double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
 
 #if 0
        /*
@@ -680,8 +681,8 @@ static void run_one_test(struct perf_sched *sched)
         * accurate than the sched->sum_exec_runtime based statistics:
         */
        printf(" [%0.2f / %0.2f]",
-               (double)sched->parent_cpu_usage/1e6,
-               (double)sched->runavg_parent_cpu_usage/1e6);
+               (double)sched->parent_cpu_usage / NSEC_PER_MSEC,
+               (double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
 #endif
 
        printf("\n");
@@ -696,13 +697,13 @@ static void test_calibrations(struct perf_sched *sched)
        u64 T0, T1;
 
        T0 = get_nsecs();
-       burn_nsecs(sched, 1e6);
+       burn_nsecs(sched, NSEC_PER_MSEC);
        T1 = get_nsecs();
 
        printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
 
        T0 = get_nsecs();
-       sleep_nsecs(1e6);
+       sleep_nsecs(NSEC_PER_MSEC);
        T1 = get_nsecs();
 
        printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
@@ -1213,10 +1214,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
        avg = work_list->total_lat / work_list->nb_atoms;
 
        printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
-             (double)work_list->total_runtime / 1e6,
-                work_list->nb_atoms, (double)avg / 1e6,
-                (double)work_list->max_lat / 1e6,
-                (double)work_list->max_lat_at / 1e9);
+             (double)work_list->total_runtime / NSEC_PER_MSEC,
+                work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
+                (double)work_list->max_lat / NSEC_PER_MSEC,
+                (double)work_list->max_lat_at / NSEC_PER_SEC);
 }
 
 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
@@ -1491,7 +1492,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
        if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
                goto out;
 
-       color_fprintf(stdout, color, "  %12.6f secs ", (double)timestamp/1e9);
+       color_fprintf(stdout, color, "  %12.6f secs ", (double)timestamp / NSEC_PER_SEC);
        if (new_shortname) {
                const char *pid_color = color;
 
@@ -1753,7 +1754,7 @@ static int perf_sched__lat(struct perf_sched *sched)
 
        printf(" -----------------------------------------------------------------------------------------------------------------\n");
        printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
-               (double)sched->all_runtime / 1e6, sched->all_count);
+               (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
 
        printf(" ---------------------------------------------------\n");
 
index c859e59dfe3e7efae711fa056967c910a987d989..7228d141a789d8d50dd4386df0e38e9bba7e9183 100644 (file)
@@ -24,6 +24,7 @@
 #include "util/thread-stack.h"
 #include <linux/bitmap.h>
 #include <linux/stringify.h>
+#include <linux/time64.h>
 #include "asm/bug.h"
 #include "util/mem-events.h"
 
@@ -464,9 +465,9 @@ static void print_sample_start(struct perf_sample *sample,
 
        if (PRINT_FIELD(TIME)) {
                nsecs = sample->time;
-               secs = nsecs / NSECS_PER_SEC;
-               nsecs -= secs * NSECS_PER_SEC;
-               usecs = nsecs / NSECS_PER_USEC;
+               secs = nsecs / NSEC_PER_SEC;
+               nsecs -= secs * NSEC_PER_SEC;
+               usecs = nsecs / NSEC_PER_USEC;
                if (nanosecs)
                        printf("%5lu.%09llu: ", secs, nsecs);
                else
@@ -521,11 +522,11 @@ static void print_sample_brstacksym(struct perf_sample *sample,
 
                thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
                if (alf.map)
-                       alf.sym = map__find_symbol(alf.map, alf.addr, NULL);
+                       alf.sym = map__find_symbol(alf.map, alf.addr);
 
                thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
                if (alt.map)
-                       alt.sym = map__find_symbol(alt.map, alt.addr, NULL);
+                       alt.sym = map__find_symbol(alt.map, alt.addr);
 
                symbol__fprintf_symname_offs(alf.sym, &alf, stdout);
                putchar('/');
index 3c7452b39f57649b05d675db3d19395fb765df2d..90882b1d6a915db8288d5705e88c277e3e8be0cc 100644 (file)
@@ -65,6 +65,7 @@
 #include "util/group.h"
 #include "asm/bug.h"
 
+#include <linux/time64.h>
 #include <api/fs/fs.h>
 #include <stdlib.h>
 #include <sys/prctl.h>
@@ -172,7 +173,7 @@ static inline void diff_timespec(struct timespec *r, struct timespec *a,
 {
        r->tv_sec = a->tv_sec - b->tv_sec;
        if (a->tv_nsec < b->tv_nsec) {
-               r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
+               r->tv_nsec = a->tv_nsec + NSEC_PER_SEC - b->tv_nsec;
                r->tv_sec--;
        } else {
                r->tv_nsec = a->tv_nsec - b->tv_nsec ;
@@ -354,7 +355,7 @@ static void process_interval(void)
        diff_timespec(&rs, &ts, &ref_time);
 
        if (STAT_RECORD) {
-               if (WRITE_STAT_ROUND_EVENT(rs.tv_sec * NSECS_PER_SEC + rs.tv_nsec, INTERVAL))
+               if (WRITE_STAT_ROUND_EVENT(rs.tv_sec * NSEC_PER_SEC + rs.tv_nsec, INTERVAL))
                        pr_err("failed to write stat round event\n");
        }
 
@@ -364,7 +365,7 @@ static void process_interval(void)
 static void enable_counters(void)
 {
        if (initial_delay)
-               usleep(initial_delay * 1000);
+               usleep(initial_delay * USEC_PER_MSEC);
 
        /*
         * We need to enable counters only if:
@@ -541,8 +542,8 @@ static int __run_perf_stat(int argc, const char **argv)
        bool is_pipe = STAT_RECORD ? perf_stat.file.is_pipe : false;
 
        if (interval) {
-               ts.tv_sec  = interval / 1000;
-               ts.tv_nsec = (interval % 1000) * 1000000;
+               ts.tv_sec  = interval / USEC_PER_MSEC;
+               ts.tv_nsec = (interval % USEC_PER_MSEC) * NSEC_PER_MSEC;
        } else {
                ts.tv_sec  = 1;
                ts.tv_nsec = 0;
@@ -971,7 +972,7 @@ static void print_metric_header(void *ctx, const char *color __maybe_unused,
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
        FILE *output = stat_config.output;
-       double msecs = avg / 1e6;
+       double msecs = avg / NSEC_PER_MSEC;
        const char *fmt_v, *fmt_n;
        char name[25];
 
@@ -1460,7 +1461,7 @@ static void print_footer(void)
        if (!null_run)
                fprintf(output, "\n");
        fprintf(output, " %17.9f seconds time elapsed",
-                       avg_stats(&walltime_nsecs_stats)/1e9);
+                       avg_stats(&walltime_nsecs_stats) / NSEC_PER_SEC);
        if (run_count > 1) {
                fprintf(output, "                                        ");
                print_noise_pct(stddev_stats(&walltime_nsecs_stats),
@@ -2175,8 +2176,8 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
                update_stats(&walltime_nsecs_stats, stat_round->time);
 
        if (stat_config.interval && stat_round->time) {
-               tsh.tv_sec  = stat_round->time / NSECS_PER_SEC;
-               tsh.tv_nsec = stat_round->time % NSECS_PER_SEC;
+               tsh.tv_sec  = stat_round->time / NSEC_PER_SEC;
+               tsh.tv_nsec = stat_round->time % NSEC_PER_SEC;
                ts = &tsh;
        }
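
Two conversions recur in the builtin-stat hunks: timespec subtraction with a borrow of NSEC_PER_SEC, and splitting a millisecond interval into tv_sec/tv_nsec. A minimal standalone sketch, constants defined locally and the interval value made up:

#include <stdio.h>
#include <time.h>

#define USEC_PER_MSEC	1000UL
#define NSEC_PER_MSEC	1000000UL
#define NSEC_PER_SEC	1000000000UL

/* r = a - b, borrowing one second's worth of nanoseconds when needed. */
static void diff_timespec(struct timespec *r, const struct timespec *a,
			  const struct timespec *b)
{
	r->tv_sec = a->tv_sec - b->tv_sec;
	if (a->tv_nsec < b->tv_nsec) {
		r->tv_nsec = a->tv_nsec + NSEC_PER_SEC - b->tv_nsec;
		r->tv_sec--;
	} else {
		r->tv_nsec = a->tv_nsec - b->tv_nsec;
	}
}

int main(void)
{
	struct timespec a = { .tv_sec = 3, .tv_nsec = 100000000 };
	struct timespec b = { .tv_sec = 1, .tv_nsec = 900000000 };
	struct timespec r, ts;
	unsigned int interval = 2500;	/* report interval in milliseconds */

	diff_timespec(&r, &a, &b);
	printf("diff: %ld.%09ld s\n", (long)r.tv_sec, (long)r.tv_nsec);

	/* Millisecond interval -> timespec, as in __run_perf_stat(). */
	ts.tv_sec = interval / USEC_PER_MSEC;
	ts.tv_nsec = (interval % USEC_PER_MSEC) * NSEC_PER_MSEC;
	printf("interval: %ld s + %ld ns\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return 0;
}
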
 
index 733a55422d030037ab1c84bbf4b351c45074cd2c..e7eaa298d34a1ec020a123ade8296451ffe30b73 100644 (file)
@@ -24,6 +24,7 @@
 #include "util/evlist.h"
 #include "util/evsel.h"
 #include <linux/rbtree.h>
+#include <linux/time64.h>
 #include "util/symbol.h"
 #include "util/callchain.h"
 #include "util/strlist.h"
@@ -1288,9 +1289,9 @@ static void draw_process_bars(struct timechart *tchart)
                        if (c->comm) {
                                char comm[256];
                                if (c->total_time > 5000000000) /* 5 seconds */
-                                       sprintf(comm, "%s:%i (%2.2fs)", c->comm, p->pid, c->total_time / 1000000000.0);
+                                       sprintf(comm, "%s:%i (%2.2fs)", c->comm, p->pid, c->total_time / (double)NSEC_PER_SEC);
                                else
-                                       sprintf(comm, "%s:%i (%3.1fms)", c->comm, p->pid, c->total_time / 1000000.0);
+                                       sprintf(comm, "%s:%i (%3.1fms)", c->comm, p->pid, c->total_time / (double)NSEC_PER_MSEC);
 
                                svg_text(Y, c->start_time, comm);
                        }
@@ -1637,7 +1638,7 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name)
        write_svg_file(tchart, output_name);
 
        pr_info("Written %2.1f seconds of trace to %s.\n",
-               (tchart->last_time - tchart->first_time) / 1000000000.0, output_name);
+               (tchart->last_time - tchart->first_time) / (double)NSEC_PER_SEC, output_name);
 out_delete:
        perf_session__delete(session);
        return ret;
@@ -1901,10 +1902,10 @@ parse_time(const struct option *opt, const char *arg, int __maybe_unused unset)
        if (sscanf(arg, "%" PRIu64 "%cs", value, &unit) > 0) {
                switch (unit) {
                case 'm':
-                       *value *= 1000000;
+                       *value *= NSEC_PER_MSEC;
                        break;
                case 'u':
-                       *value *= 1000;
+                       *value *= NSEC_PER_USEC;
                        break;
                case 'n':
                        break;
@@ -1928,7 +1929,7 @@ int cmd_timechart(int argc, const char **argv,
                        .ordered_events  = true,
                },
                .proc_num = 15,
-               .min_time = 1000000,
+               .min_time = NSEC_PER_MSEC,
                .merge_dist = 1000,
        };
        const char *output_name = "output.svg";
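
parse_time() above scales a numeric argument by its unit suffix (m, u or n) into nanoseconds. A standalone sketch of the same parsing, with the constants defined locally and SCNu64 used for the scan:

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define NSEC_PER_MSEC	1000000ULL

/* "10ms" -> 10000000, "250us" -> 250000, "42" -> 42; 0 on parse error. */
static uint64_t parse_time(const char *arg)
{
	uint64_t value;
	char unit = 'n';

	if (sscanf(arg, "%" SCNu64 "%cs", &value, &unit) < 1)
		return 0;

	switch (unit) {
	case 'm':
		value *= NSEC_PER_MSEC;
		break;
	case 'u':
		value *= NSEC_PER_USEC;
		break;
	case 'n':
		break;
	default:
		return 0;
	}
	return value;
}

int main(void)
{
	printf("%" PRIu64 "\n", parse_time("10ms"));
	printf("%" PRIu64 "\n", parse_time("250us"));
	printf("%" PRIu64 "\n", parse_time("42"));
	return 0;
}
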
index 418ed94756d357f5101713945ae37b77178acb38..40078570256640c5268fbd5c8d2a450960e95924 100644 (file)
@@ -68,6 +68,7 @@
 #include <sys/mman.h>
 
 #include <linux/stringify.h>
+#include <linux/time64.h>
 #include <linux/types.h>
 
 static volatile int done;
@@ -624,7 +625,7 @@ static void *display_thread(void *arg)
        display_setup_sig();
        pthread__unblock_sigwinch();
 repeat:
-       delay_msecs = top->delay_secs * 1000;
+       delay_msecs = top->delay_secs * MSEC_PER_SEC;
        set_term_quiet_input(&save);
        /* trash return*/
        getc(stdin);
@@ -656,34 +657,6 @@ repeat:
        return NULL;
 }
 
-static int symbol_filter(struct map *map, struct symbol *sym)
-{
-       const char *name = sym->name;
-
-       if (!__map__is_kernel(map))
-               return 0;
-       /*
-        * ppc64 uses function descriptors and appends a '.' to the
-        * start of every instruction address. Remove it.
-        */
-       if (name[0] == '.')
-               name++;
-
-       if (!strcmp(name, "_text") ||
-           !strcmp(name, "_etext") ||
-           !strcmp(name, "_sinittext") ||
-           !strncmp("init_module", name, 11) ||
-           !strncmp("cleanup_module", name, 14) ||
-           strstr(name, "_text_start") ||
-           strstr(name, "_text_end"))
-               return 1;
-
-       if (symbol__is_idle(sym))
-               sym->ignore = true;
-
-       return 0;
-}
-
 static int hist_iter__top_callback(struct hist_entry_iter *iter,
                                   struct addr_location *al, bool single,
                                   void *arg)
@@ -782,7 +755,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
                }
        }
 
-       if (al.sym == NULL || !al.sym->ignore) {
+       if (al.sym == NULL || !al.sym->idle) {
                struct hists *hists = evsel__hists(evsel);
                struct hist_entry_iter iter = {
                        .evsel          = evsel,
@@ -948,8 +921,6 @@ static int __cmd_top(struct perf_top *top)
        if (top->session == NULL)
                return -1;
 
-       machines__set_symbol_filter(&top->session->machines, symbol_filter);
-
        if (!objdump_path) {
                ret = perf_env__lookup_objdump(&top->session->header.env);
                if (ret)
@@ -1323,7 +1294,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
        if (symbol_conf.cumulate_callchain && !callchain_param.order_set)
                callchain_param.order = ORDER_CALLER;
 
-       symbol_conf.priv_size = sizeof(struct annotation);
+       status = symbol__annotation_init();
+       if (status < 0)
+               goto out_delete_evlist;
 
        symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
        if (symbol__init(NULL) < 0)
index b8c6766301db90ddbe1dfa1fb65d2ab0a024ecc7..b4fc1ab3d2a706b3b684d3699988fcb7bd663900 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/audit.h>
 #include <linux/random.h>
 #include <linux/stringify.h>
+#include <linux/time64.h>
 
 #ifndef O_CLOEXEC
 # define O_CLOEXEC             02000000
index 7ed72a475c57b217ff88504fd2903496f2daf25c..e4b717e9eb6cf532c2d9c823c1b8f71e8da4256b 100644 (file)
@@ -20,7 +20,6 @@
 #endif
 
 #ifdef __powerpc__
-#include "../../arch/powerpc/include/uapi/asm/unistd.h"
 #define CPUINFO_PROC   {"cpu"}
 #endif
 
index cb0f1356ff8149b4d0790e3d181cd6802f6886f0..9a0236a4cf95fb7226af43e33b88e351359b8a2b 100644 (file)
@@ -14,13 +14,6 @@ void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
 #define HAVE_ATTR_TEST
 #include "perf-sys.h"
 
-#ifndef NSEC_PER_SEC
-# define NSEC_PER_SEC                  1000000000ULL
-#endif
-#ifndef NSEC_PER_USEC
-# define NSEC_PER_USEC                 1000ULL
-#endif
-
 static inline unsigned long long rdclock(void)
 {
        struct timespec ts;
index 615780cbfe1d86dce93bc4dfb60e6850a42797b0..e6d1816e431a9c3bd0839db29dd8b231c40b8455 100644 (file)
@@ -97,7 +97,7 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
 
        evlist = perf_evlist__new();
        if (!evlist) {
-               pr_debug("No ehough memory to create evlist\n");
+               pr_debug("Not enough memory to create evlist\n");
                return TEST_FAIL;
        }
 
index fc54064b91860edf0b1d173f1c05eee2bac4762c..2673e86ed50fad3496e1b50c7770142e21d6d32c 100644 (file)
@@ -125,7 +125,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
        /* Instead of perf_evlist__new_default, don't add default events */
        evlist = perf_evlist__new();
        if (!evlist) {
-               pr_debug("No ehough memory to create evlist\n");
+               pr_debug("Not enough memory to create evlist\n");
                return TEST_FAIL;
        }
 
index 2af156a8d4e5d9a1fe5ebfcc435f0c5540f17399..ff5bc6363a79de05084aca11ec856f468ce9903d 100644 (file)
@@ -263,7 +263,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
         * Converting addresses for use by objdump requires more information.
         * map__load() does that.  See map__rip_2objdump() for details.
         */
-       if (map__load(al.map, NULL))
+       if (map__load(al.map))
                return -1;
 
        /* objdump struggles with kcore - try each map only once */
@@ -511,7 +511,7 @@ static int do_test_code_reading(bool try_kcore)
 
        /* Load kernel map */
        map = machine__kernel_map(machine);
-       ret = map__load(map, NULL);
+       ret = map__load(map);
        if (ret < 0) {
                pr_debug("map__load failed\n");
                goto out_err;
index e63abab7d5a17c1f283b3a90a68e19918edf420d..a5082331f2464929ed4ff6d3cae9869c41b1772e 100644 (file)
@@ -8,14 +8,6 @@
 #include "debug.h"
 #include "machine.h"
 
-static int vmlinux_matches_kallsyms_filter(struct map *map __maybe_unused,
-                                          struct symbol *sym)
-{
-       bool *visited = symbol__priv(sym);
-       *visited = true;
-       return 0;
-}
-
 #define UM(x) kallsyms_map->unmap_ip(kallsyms_map, (x))
 
 int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
@@ -28,6 +20,7 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
        enum map_type type = MAP__FUNCTION;
        struct maps *maps = &vmlinux.kmaps.maps[type];
        u64 mem_start, mem_end;
+       bool header_printed;
 
        /*
         * Step 1:
@@ -61,7 +54,7 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
         * be compacted against the list of modules found in the "vmlinux"
         * code and with the one got from /proc/modules from the "kallsyms" code.
         */
-       if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) {
+       if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true) <= 0) {
                pr_debug("dso__load_kallsyms ");
                goto out;
        }
@@ -99,8 +92,7 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
         * maps__reloc_vmlinux will notice and set proper ->[un]map_ip routines
         * to fixup the symbols.
         */
-       if (machine__load_vmlinux_path(&vmlinux, type,
-                                      vmlinux_matches_kallsyms_filter) <= 0) {
+       if (machine__load_vmlinux_path(&vmlinux, type) <= 0) {
                pr_debug("Couldn't find a vmlinux that matches the kernel running on this machine, skipping test\n");
                err = TEST_SKIP;
                goto out;
@@ -126,7 +118,7 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
                mem_end = vmlinux_map->unmap_ip(vmlinux_map, sym->end);
 
                first_pair = machine__find_kernel_symbol(&kallsyms, type,
-                                                        mem_start, NULL, NULL);
+                                                        mem_start, NULL);
                pair = first_pair;
 
                if (pair && UM(pair->start) == mem_start) {
@@ -143,7 +135,7 @@ next_pair:
                                 */
                                s64 skew = mem_end - UM(pair->end);
                                if (llabs(skew) >= page_size)
-                                       pr_debug("%#" PRIx64 ": diff end addr for %s v: %#" PRIx64 " k: %#" PRIx64 "\n",
+                                       pr_debug("WARN: %#" PRIx64 ": diff end addr for %s v: %#" PRIx64 " k: %#" PRIx64 "\n",
                                                 mem_start, sym->name, mem_end,
                                                 UM(pair->end));
 
@@ -154,22 +146,23 @@ next_pair:
                                 * kallsyms.
                                 */
                                continue;
-
                        } else {
-                               pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL, NULL);
+                               pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL);
                                if (pair) {
                                        if (UM(pair->start) == mem_start)
                                                goto next_pair;
 
-                                       pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+                                       pr_debug("WARN: %#" PRIx64 ": diff name v: %s k: %s\n",
                                                 mem_start, sym->name, pair->name);
                                } else {
-                                       pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+                                       pr_debug("WARN: %#" PRIx64 ": diff name v: %s k: %s\n",
                                                 mem_start, sym->name, first_pair->name);
                                }
+
+                               continue;
                        }
                } else
-                       pr_debug("%#" PRIx64 ": %s not on kallsyms\n",
+                       pr_debug("ERR : %#" PRIx64 ": %s not on kallsyms\n",
                                 mem_start, sym->name);
 
                err = -1;
@@ -178,7 +171,7 @@ next_pair:
        if (!verbose)
                goto out;
 
-       pr_info("Maps only in vmlinux:\n");
+       header_printed = false;
 
        for (map = maps__first(maps); map; map = map__next(map)) {
                struct map *
@@ -192,13 +185,18 @@ next_pair:
                                                (map->dso->kernel ?
                                                        map->dso->short_name :
                                                        map->dso->name));
-               if (pair)
+               if (pair) {
                        pair->priv = 1;
-               else
+               } else {
+                       if (!header_printed) {
+                               pr_info("WARN: Maps only in vmlinux:\n");
+                               header_printed = true;
+                       }
                        map__fprintf(map, stderr);
+               }
        }
 
-       pr_info("Maps in vmlinux with a different name in kallsyms:\n");
+       header_printed = false;
 
        for (map = maps__first(maps); map; map = map__next(map)) {
                struct map *pair;
@@ -211,24 +209,33 @@ next_pair:
                        continue;
 
                if (pair->start == mem_start) {
-                       pair->priv = 1;
-                       pr_info(" %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as",
+                       if (!header_printed) {
+                               pr_info("WARN: Maps in vmlinux with a different name in kallsyms:\n");
+                               header_printed = true;
+                       }
+
+                       pr_info("WARN: %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as",
                                map->start, map->end, map->pgoff, map->dso->name);
                        if (mem_end != pair->end)
-                               pr_info(":\n*%" PRIx64 "-%" PRIx64 " %" PRIx64,
+                               pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64,
                                        pair->start, pair->end, pair->pgoff);
                        pr_info(" %s\n", pair->dso->name);
                        pair->priv = 1;
                }
        }
 
-       pr_info("Maps only in kallsyms:\n");
+       header_printed = false;
 
        maps = &kallsyms.kmaps.maps[type];
 
        for (map = maps__first(maps); map; map = map__next(map)) {
-               if (!map->priv)
+               if (!map->priv) {
+                       if (!header_printed) {
+                               pr_info("WARN: Maps only in kallsyms:\n");
+                               header_printed = true;
+                       }
                        map__fprintf(map, stderr);
+               }
        }
 out:
        machine__exit(&kallsyms);
index 2e2d10022355ba0303f4462a21758eec773986d0..4c18271c71c9a14458e03624c110f771d2513060 100644 (file)
@@ -495,7 +495,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser,
        if (!ins__is_call(dl->ins))
                return false;
 
-       if (map_groups__find_ams(&target, NULL) ||
+       if (map_groups__find_ams(&target) ||
            map__rip_2objdump(target.map, target.map->map_ip(target.map,
                                                             target.addr)) !=
            dl->ops.target.addr) {
index 13d414384739d9673089958aa2daa56de9a5b255..f0611c937d4b8c5f492c2f8e23db88a343b5ab97 100644 (file)
@@ -69,8 +69,11 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb)
 static void hist_browser__update_rows(struct hist_browser *hb)
 {
        struct ui_browser *browser = &hb->b;
-       u16 header_offset = hb->show_headers ? 1 : 0, index_row;
+       struct hists *hists = hb->hists;
+       struct perf_hpp_list *hpp_list = hists->hpp_list;
+       u16 header_offset, index_row;
 
+       header_offset = hb->show_headers ? hpp_list->nr_header_lines : 0;
        browser->rows = browser->height - header_offset;
        /*
         * Verify if we were at the last line and that line isn't
@@ -99,8 +102,11 @@ static void hist_browser__refresh_dimensions(struct ui_browser *browser)
 
 static void hist_browser__gotorc(struct hist_browser *browser, int row, int column)
 {
-       u16 header_offset = browser->show_headers ? 1 : 0;
+       struct hists *hists = browser->hists;
+       struct perf_hpp_list *hpp_list = hists->hpp_list;
+       u16 header_offset;
 
+       header_offset = browser->show_headers ? hpp_list->nr_header_lines : 0;
        ui_browser__gotorc(&browser->b, row + header_offset, column);
 }
 
@@ -1496,7 +1502,9 @@ static int advance_hpp_check(struct perf_hpp *hpp, int inc)
        return hpp->size <= 0;
 }
 
-static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *buf, size_t size)
+static int
+hists_browser__scnprintf_headers(struct hist_browser *browser, char *buf,
+                                size_t size, int line)
 {
        struct hists *hists = browser->hists;
        struct perf_hpp dummy_hpp = {
@@ -1506,6 +1514,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
        struct perf_hpp_fmt *fmt;
        size_t ret = 0;
        int column = 0;
+       int span = 0;
 
        if (symbol_conf.use_callchain) {
                ret = scnprintf(buf, size, "  ");
@@ -1517,10 +1526,13 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
                if (perf_hpp__should_skip(fmt, hists)  || column++ < browser->b.horiz_scroll)
                        continue;
 
-               ret = fmt->header(fmt, &dummy_hpp, hists);
+               ret = fmt->header(fmt, &dummy_hpp, hists, line, &span);
                if (advance_hpp_check(&dummy_hpp, ret))
                        break;
 
+               if (span)
+                       continue;
+
                ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "  ");
                if (advance_hpp_check(&dummy_hpp, ret))
                        break;
@@ -1554,7 +1566,7 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows
                if (column++ < browser->b.horiz_scroll)
                        continue;
 
-               ret = fmt->header(fmt, &dummy_hpp, hists);
+               ret = fmt->header(fmt, &dummy_hpp, hists, 0, NULL);
                if (advance_hpp_check(&dummy_hpp, ret))
                        break;
 
@@ -1591,7 +1603,7 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows
                        }
                        first_col = false;
 
-                       ret = fmt->header(fmt, &dummy_hpp, hists);
+                       ret = fmt->header(fmt, &dummy_hpp, hists, 0, NULL);
                        dummy_hpp.buf[ret] = '\0';
 
                        start = trim(dummy_hpp.buf);
@@ -1622,14 +1634,21 @@ static void hists_browser__hierarchy_headers(struct hist_browser *browser)
 
 static void hists_browser__headers(struct hist_browser *browser)
 {
-       char headers[1024];
+       struct hists *hists = browser->hists;
+       struct perf_hpp_list *hpp_list = hists->hpp_list;
 
-       hists_browser__scnprintf_headers(browser, headers,
-                                        sizeof(headers));
+       int line;
 
-       ui_browser__gotorc(&browser->b, 0, 0);
-       ui_browser__set_color(&browser->b, HE_COLORSET_ROOT);
-       ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1);
+       for (line = 0; line < hpp_list->nr_header_lines; line++) {
+               char headers[1024];
+
+               hists_browser__scnprintf_headers(browser, headers,
+                                                sizeof(headers), line);
+
+               ui_browser__gotorc(&browser->b, line, 0);
+               ui_browser__set_color(&browser->b, HE_COLORSET_ROOT);
+               ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1);
+       }
 }
 
 static void hist_browser__show_headers(struct hist_browser *browser)
@@ -1656,10 +1675,13 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser)
        u16 header_offset = 0;
        struct rb_node *nd;
        struct hist_browser *hb = container_of(browser, struct hist_browser, b);
+       struct hists *hists = hb->hists;
 
        if (hb->show_headers) {
+               struct perf_hpp_list *hpp_list = hists->hpp_list;
+
                hist_browser__show_headers(hb);
-               header_offset = 1;
+               header_offset = hpp_list->nr_header_lines;
        }
 
        ui_browser__hists_init_top(browser);
@@ -2418,8 +2440,6 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
                browser->hists->dso_filter = NULL;
                ui_helpline__pop();
        } else {
-               if (map == NULL)
-                       return 0;
                ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s DSO\"",
                                   __map__is_kernel(map) ? "the Kernel" : map->dso->short_name);
                browser->hists->dso_filter = map->dso;
index 80912778bb6d5623bd6555fd7f843a84acb1d51a..98a34664bb7eb16667ee7169129e7e5b5b8022db 100644 (file)
@@ -52,9 +52,9 @@ static int map_browser__search(struct map_browser *browser)
 
        if (target[0] == '0' && tolower(target[1]) == 'x') {
                u64 addr = strtoull(target, NULL, 16);
-               sym = map__find_symbol(browser->map, addr, NULL);
+               sym = map__find_symbol(browser->map, addr);
        } else
-               sym = map__find_symbol_by_name(browser->map, target, NULL);
+               sym = map__find_symbol_by_name(browser->map, target);
 
        if (sym != NULL) {
                u32 *idx = symbol__browser_index(sym);
index c5f3677f66797bdbf34c171cfc398009a15b60cd..a4f02de7c1b54d426bb1b3f50b7fc2ba39ae5fa8 100644 (file)
@@ -549,7 +549,7 @@ static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists,
                                strcat(buf, "+");
                        first_col = false;
 
-                       fmt->header(fmt, &hpp, hists);
+                       fmt->header(fmt, &hpp, hists, 0, NULL);
                        strcat(buf, ltrim(rtrim(hpp.buf)));
                }
        }
index 4274969ddc89471e962dcf0097314b73963c65be..b47fafc8ee2aab20878cba272666fd878fd18458 100644 (file)
@@ -230,7 +230,8 @@ static int hpp__width_fn(struct perf_hpp_fmt *fmt,
 }
 
 static int hpp__header_fn(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
-                         struct hists *hists)
+                         struct hists *hists, int line __maybe_unused,
+                         int *span __maybe_unused)
 {
        int len = hpp__width_fn(fmt, hpp, hists);
        return scnprintf(hpp->buf, hpp->size, "%*s", len, fmt->name);
@@ -441,6 +442,7 @@ struct perf_hpp_fmt perf_hpp__format[] = {
 struct perf_hpp_list perf_hpp_list = {
        .fields = LIST_HEAD_INIT(perf_hpp_list.fields),
        .sorts  = LIST_HEAD_INIT(perf_hpp_list.sorts),
+       .nr_header_lines = 1,
 };
 
 #undef HPP__COLOR_PRINT_FNS
index f04a6311207935fc022ca0b9557db8639d6c00a9..9b65f4a6b35a4b24877fac72e24e23a27ba10ae0 100644 (file)
@@ -549,7 +549,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
                                    struct perf_hpp_list_node, list);
 
        perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
-               fmt->header(fmt, hpp, hists);
+               fmt->header(fmt, hpp, hists, 0, NULL);
                fprintf(fp, "%s%s", hpp->buf, sep ?: "  ");
        }
 
@@ -569,7 +569,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
                                header_width += fprintf(fp, "+");
                        first_col = false;
 
-                       fmt->header(fmt, hpp, hists);
+                       fmt->header(fmt, hpp, hists, 0, NULL);
 
                        header_width += fprintf(fp, "%s", trim(hpp->buf));
                }
@@ -639,33 +639,52 @@ hists__fprintf_hierarchy_headers(struct hists *hists,
        return print_hierarchy_header(hists, hpp, symbol_conf.field_sep, fp);
 }
 
-static int
-hists__fprintf_standard_headers(struct hists *hists,
-                               struct perf_hpp *hpp,
-                               FILE *fp)
+static void fprintf_line(struct hists *hists, struct perf_hpp *hpp,
+                        int line, FILE *fp)
 {
        struct perf_hpp_fmt *fmt;
-       unsigned int width;
        const char *sep = symbol_conf.field_sep;
        bool first = true;
+       int span = 0;
 
        hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
-               if (!first)
+               if (!first && !span)
                        fprintf(fp, "%s", sep ?: "  ");
                else
                        first = false;
 
-               fmt->header(fmt, hpp, hists);
-               fprintf(fp, "%s", hpp->buf);
+               fmt->header(fmt, hpp, hists, line, &span);
+
+               if (!span)
+                       fprintf(fp, "%s", hpp->buf);
        }
+}
 
-       fprintf(fp, "\n");
+static int
+hists__fprintf_standard_headers(struct hists *hists,
+                               struct perf_hpp *hpp,
+                               FILE *fp)
+{
+       struct perf_hpp_list *hpp_list = hists->hpp_list;
+       struct perf_hpp_fmt *fmt;
+       unsigned int width;
+       const char *sep = symbol_conf.field_sep;
+       bool first = true;
+       int line;
+
+       for (line = 0; line < hpp_list->nr_header_lines; line++) {
+               /* first # is displayed one level up */
+               if (line)
+                       fprintf(fp, "# ");
+               fprintf_line(hists, hpp, line, fp);
+               fprintf(fp, "\n");
+       }
 
        if (sep)
-               return 1;
+               return hpp_list->nr_header_lines;
 
        first = true;
 
@@ -689,7 +708,7 @@ hists__fprintf_standard_headers(struct hists *hists,
 
        fprintf(fp, "\n");
        fprintf(fp, "#\n");
-       return 3;
+       return hpp_list->nr_header_lines + 2;
 }
 
 static int hists__fprintf_headers(struct hists *hists, FILE *fp)
index 91c5f6e1af59a23e23013f52d4058dca9825ff7c..96f99d608d0049491cae7cb6d3907390afaee562 100644 (file)
@@ -1,5 +1,6 @@
 libperf-y += alias.o
 libperf-y += annotate.o
+libperf-y += block-range.o
 libperf-y += build-id.o
 libperf-y += config.o
 libperf-y += ctype.o
@@ -98,6 +99,7 @@ endif
 
 libperf-$(CONFIG_DWARF) += probe-finder.o
 libperf-$(CONFIG_DWARF) += dwarf-aux.o
+libperf-$(CONFIG_DWARF) += dwarf-regs.o
 
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 libperf-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
index 4024d309bb00adaa2dbdb0a6e80afa2e6a5935a9..7a80c7362a037b90e5da9f996bc34947c2c7db85 100644 (file)
@@ -17,6 +17,7 @@
 #include "debug.h"
 #include "annotate.h"
 #include "evsel.h"
+#include "block-range.h"
 #include <regex.h>
 #include <pthread.h>
 #include <linux/bitops.h>
@@ -491,13 +492,6 @@ static struct ins *ins__find(const char *name)
        return bsearch(name, instructions, nmemb, sizeof(struct ins), ins__key_cmp);
 }
 
-int symbol__annotate_init(struct map *map __maybe_unused, struct symbol *sym)
-{
-       struct annotation *notes = symbol__annotation(sym);
-       pthread_mutex_init(&notes->lock, NULL);
-       return 0;
-}
-
 int symbol__alloc_hist(struct symbol *sym)
 {
        struct annotation *notes = symbol__annotation(sym);
@@ -866,6 +860,89 @@ double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
        return percent;
 }
 
+static const char *annotate__address_color(struct block_range *br)
+{
+       double cov = block_range__coverage(br);
+
+       if (cov >= 0) {
+               /* mark red for >75% coverage */
+               if (cov > 0.75)
+                       return PERF_COLOR_RED;
+
+               /* mark dull for <1% coverage */
+               if (cov < 0.01)
+                       return PERF_COLOR_NORMAL;
+       }
+
+       return PERF_COLOR_MAGENTA;
+}
+
+static const char *annotate__asm_color(struct block_range *br)
+{
+       double cov = block_range__coverage(br);
+
+       if (cov >= 0) {
+               /* mark dull for <1% coverage */
+               if (cov < 0.01)
+                       return PERF_COLOR_NORMAL;
+       }
+
+       return PERF_COLOR_BLUE;
+}
+
+static void annotate__branch_printf(struct block_range *br, u64 addr)
+{
+       bool emit_comment = true;
+
+       if (!br)
+               return;
+
+#if 1
+       if (br->is_target && br->start == addr) {
+               struct block_range *branch = br;
+               double p;
+
+               /*
+                * Find matching branch to our target.
+                */
+               while (!branch->is_branch)
+                       branch = block_range__next(branch);
+
+               p = 100 *(double)br->entry / branch->coverage;
+
+               if (p > 0.1) {
+                       if (emit_comment) {
+                               emit_comment = false;
+                               printf("\t#");
+                       }
+
+                       /*
+                        * The percentage of coverage joined at this target in relation
+                        * to the next branch.
+                        */
+                       printf(" +%.2f%%", p);
+               }
+       }
+#endif
+       if (br->is_branch && br->end == addr) {
+               double p = 100*(double)br->taken / br->coverage;
+
+               if (p > 0.1) {
+                       if (emit_comment) {
+                               emit_comment = false;
+                               printf("\t#");
+                       }
+
+                       /*
+                        * The percentage of coverage leaving at this branch, and
+                        * its prediction ratio.
+                        */
+                       printf(" -%.2f%% (p:%.2f%%)", p, 100*(double)br->pred  / br->taken);
+               }
+       }
+}
+
+
 static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 start,
                      struct perf_evsel *evsel, u64 len, int min_pcnt, int printed,
                      int max_lines, struct disasm_line *queue)
@@ -885,6 +962,7 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
                s64 offset = dl->offset;
                const u64 addr = start + offset;
                struct disasm_line *next;
+               struct block_range *br;
 
                next = disasm__get_next_ip_line(&notes->src->source, dl);
 
@@ -954,8 +1032,12 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
                }
 
                printf(" :      ");
-               color_fprintf(stdout, PERF_COLOR_MAGENTA, "  %" PRIx64 ":", addr);
-               color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", dl->line);
+
+               br = block_range__find(addr);
+               color_fprintf(stdout, annotate__address_color(br), "  %" PRIx64 ":", addr);
+               color_fprintf(stdout, annotate__asm_color(br), "%s", dl->line);
+               annotate__branch_printf(br, addr);
+               printf("\n");
 
                if (ppercents != &percent)
                        free(ppercents);
@@ -1084,7 +1166,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
                        .addr = dl->ops.target.addr,
                };
 
-               if (!map_groups__find_ams(&target, NULL) &&
+               if (!map_groups__find_ams(&target) &&
                    target.sym->start == target.al_addr)
                        dl->ops.target.name = strdup(target.sym->name);
        }
@@ -1162,53 +1244,60 @@ int symbol__strerror_disassemble(struct symbol *sym __maybe_unused, struct map *
        return 0;
 }
 
-int symbol__disassemble(struct symbol *sym, struct map *map, size_t privsize)
+static int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size)
 {
-       struct dso *dso = map->dso;
-       char *filename = dso__build_id_filename(dso, NULL, 0);
-       bool free_filename = true;
-       char command[PATH_MAX * 2];
-       FILE *file;
-       int err = 0;
-       char symfs_filename[PATH_MAX];
-       struct kcore_extract kce;
-       bool delete_extract = false;
-       int stdout_fd[2];
-       int lineno = 0;
-       int nline;
-       pid_t pid;
+       char linkname[PATH_MAX];
+       char *build_id_filename;
 
-       if (filename)
-               symbol__join_symfs(symfs_filename, filename);
+       if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
+           !dso__is_kcore(dso))
+               return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
 
-       if (filename == NULL) {
+       build_id_filename = dso__build_id_filename(dso, NULL, 0);
+       if (build_id_filename) {
+               __symbol__join_symfs(filename, filename_size, build_id_filename);
+               free(build_id_filename);
+       } else {
                if (dso->has_build_id)
                        return ENOMEM;
                goto fallback;
-       } else if (dso__is_kcore(dso) ||
-                  readlink(symfs_filename, command, sizeof(command)) < 0 ||
-                  strstr(command, DSO__NAME_KALLSYMS) ||
-                  access(symfs_filename, R_OK)) {
-               free(filename);
+       }
+
+       if (dso__is_kcore(dso) ||
+           readlink(filename, linkname, sizeof(linkname)) < 0 ||
+           strstr(linkname, DSO__NAME_KALLSYMS) ||
+           access(filename, R_OK)) {
 fallback:
                /*
                 * If we don't have build-ids or the build-id file isn't in the
                 * cache, or is just a kallsyms file, well, let's hope that this
                 * DSO is the same as when 'perf record' ran.
                 */
-               filename = (char *)dso->long_name;
-               symbol__join_symfs(symfs_filename, filename);
-               free_filename = false;
+               __symbol__join_symfs(filename, filename_size, dso->long_name);
        }
 
-       if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
-           !dso__is_kcore(dso)) {
-               err = SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
-               goto out_free_filename;
-       }
+       return 0;
+}
+
+int symbol__disassemble(struct symbol *sym, struct map *map, size_t privsize)
+{
+       struct dso *dso = map->dso;
+       char command[PATH_MAX * 2];
+       FILE *file;
+       char symfs_filename[PATH_MAX];
+       struct kcore_extract kce;
+       bool delete_extract = false;
+       int stdout_fd[2];
+       int lineno = 0;
+       int nline;
+       pid_t pid;
+       int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename));
+
+       if (err)
+               return err;
 
        pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
-                filename, sym->name, map->unmap_ip(map, sym->start),
+                symfs_filename, sym->name, map->unmap_ip(map, sym->start),
                 map->unmap_ip(map, sym->end));
 
        pr_debug("annotating [%p] %30s : [%p] %30s\n",
@@ -1223,11 +1312,6 @@ fallback:
                        delete_extract = true;
                        strlcpy(symfs_filename, kce.extract_filename,
                                sizeof(symfs_filename));
-                       if (free_filename) {
-                               free(filename);
-                               free_filename = false;
-                       }
-                       filename = symfs_filename;
                }
        } else if (dso__needs_decompress(dso)) {
                char tmp[PATH_MAX];
@@ -1236,14 +1320,14 @@ fallback:
                bool ret;
 
                if (kmod_path__parse_ext(&m, symfs_filename))
-                       goto out_free_filename;
+                       goto out;
 
                snprintf(tmp, PATH_MAX, "/tmp/perf-kmod-XXXXXX");
 
                fd = mkstemp(tmp);
                if (fd < 0) {
                        free(m.ext);
-                       goto out_free_filename;
+                       goto out;
                }
 
                ret = decompress_to_file(m.ext, symfs_filename, fd);
@@ -1255,7 +1339,7 @@ fallback:
                close(fd);
 
                if (!ret)
-                       goto out_free_filename;
+                       goto out;
 
                strcpy(symfs_filename, tmp);
        }
@@ -1271,7 +1355,7 @@ fallback:
                 map__rip_2objdump(map, sym->end),
                 symbol_conf.annotate_asm_raw ? "" : "--no-show-raw",
                 symbol_conf.annotate_src ? "-S" : "",
-                symfs_filename, filename);
+                symfs_filename, symfs_filename);
 
        pr_debug("Executing: %s\n", command);
 
@@ -1333,11 +1417,10 @@ out_remove_tmp:
 
        if (dso__needs_decompress(dso))
                unlink(symfs_filename);
-out_free_filename:
+
        if (delete_extract)
                kcore_extract__delete(&kce);
-       if (free_filename)
-               free(filename);
+out:
        return err;
 
 out_close_stdout:
index f67ccb0275615c1632d86eb5f09815eaf596b905..ea44e4ff19c658188c1288f576f72fc7f170559b 100644 (file)
@@ -130,6 +130,7 @@ struct annotated_source {
 
 struct annotation {
        pthread_mutex_t         lock;
+       u64                     max_coverage;
        struct annotated_source *src;
 };
 
@@ -177,7 +178,6 @@ enum symbol_disassemble_errno {
 int symbol__strerror_disassemble(struct symbol *sym, struct map *map,
                                 int errnum, char *buf, size_t buflen);
 
-int symbol__annotate_init(struct map *map, struct symbol *sym);
 int symbol__annotate_printf(struct symbol *sym, struct map *map,
                            struct perf_evsel *evsel, bool full_paths,
                            int min_pcnt, int max_lines, int context);
diff --git a/tools/perf/util/block-range.c b/tools/perf/util/block-range.c
new file mode 100644 (file)
index 0000000..7b3e1d7
--- /dev/null
@@ -0,0 +1,328 @@
+#include "block-range.h"
+#include "annotate.h"
+
+struct {
+       struct rb_root root;
+       u64 blocks;
+} block_ranges;
+
+static void block_range__debug(void)
+{
+       /*
+        * XXX still paranoid for now; see if we can make this depend on
+        * DEBUG=1 builds.
+        */
+#if 1
+       struct rb_node *rb;
+       u64 old = 0; /* NULL isn't executable */
+
+       for (rb = rb_first(&block_ranges.root); rb; rb = rb_next(rb)) {
+               struct block_range *entry = rb_entry(rb, struct block_range, node);
+
+               assert(old < entry->start);
+               assert(entry->start <= entry->end); /* single instruction block; jump to a jump */
+
+               old = entry->end;
+       }
+#endif
+}
+
+struct block_range *block_range__find(u64 addr)
+{
+       struct rb_node **p = &block_ranges.root.rb_node;
+       struct rb_node *parent = NULL;
+       struct block_range *entry;
+
+       while (*p != NULL) {
+               parent = *p;
+               entry = rb_entry(parent, struct block_range, node);
+
+               if (addr < entry->start)
+                       p = &parent->rb_left;
+               else if (addr > entry->end)
+                       p = &parent->rb_right;
+               else
+                       return entry;
+       }
+
+       return NULL;
+}
+
+static inline void rb_link_left_of_node(struct rb_node *left, struct rb_node *node)
+{
+       struct rb_node **p = &node->rb_left;
+       while (*p) {
+               node = *p;
+               p = &node->rb_right;
+       }
+       rb_link_node(left, node, p);
+}
+
+static inline void rb_link_right_of_node(struct rb_node *right, struct rb_node *node)
+{
+       struct rb_node **p = &node->rb_right;
+       while (*p) {
+               node = *p;
+               p = &node->rb_left;
+       }
+       rb_link_node(right, node, p);
+}
+
+/**
+ * block_range__create
+ * @start: branch target starting this basic block
+ * @end:   branch ending this basic block
+ *
+ * Create all the required block ranges to precisely span the given range.
+ */
+struct block_range_iter block_range__create(u64 start, u64 end)
+{
+       struct rb_node **p = &block_ranges.root.rb_node;
+       struct rb_node *n, *parent = NULL;
+       struct block_range *next, *entry = NULL;
+       struct block_range_iter iter = { NULL, NULL };
+
+       while (*p != NULL) {
+               parent = *p;
+               entry = rb_entry(parent, struct block_range, node);
+
+               if (start < entry->start)
+                       p = &parent->rb_left;
+               else if (start > entry->end)
+                       p = &parent->rb_right;
+               else
+                       break;
+       }
+
+       /*
+        * Didn't find anything.. there's a hole at @start, however @end might
+        * be inside/behind the next range.
+        */
+       if (!*p) {
+               if (!entry) /* tree empty */
+                       goto do_whole;
+
+               /*
+                * If the last entry ends before @start, advance to find the next one.
+                */
+               n = parent;
+               if (entry->end < start) {
+                       n = rb_next(n);
+                       if (!n)
+                               goto do_whole;
+               }
+               next = rb_entry(n, struct block_range, node);
+
+               if (next->start <= end) { /* add head: [start...][n->start...] */
+                       struct block_range *head = malloc(sizeof(struct block_range));
+                       if (!head)
+                               return iter;
+
+                       *head = (struct block_range){
+                               .start          = start,
+                               .end            = next->start - 1,
+                               .is_target      = 1,
+                               .is_branch      = 0,
+                       };
+
+                       rb_link_left_of_node(&head->node, &next->node);
+                       rb_insert_color(&head->node, &block_ranges.root);
+                       block_range__debug();
+
+                       iter.start = head;
+                       goto do_tail;
+               }
+
+do_whole:
+               /*
+                * The whole [start..end] range is non-overlapping.
+                */
+               entry = malloc(sizeof(struct block_range));
+               if (!entry)
+                       return iter;
+
+               *entry = (struct block_range){
+                       .start          = start,
+                       .end            = end,
+                       .is_target      = 1,
+                       .is_branch      = 1,
+               };
+
+               rb_link_node(&entry->node, parent, p);
+               rb_insert_color(&entry->node, &block_ranges.root);
+               block_range__debug();
+
+               iter.start = entry;
+               iter.end   = entry;
+               goto done;
+       }
+
+       /*
+        * We found a range that overlapped with ours, split if needed.
+        */
+       if (entry->start < start) { /* split: [e->start...][start...] */
+               struct block_range *head = malloc(sizeof(struct block_range));
+               if (!head)
+                       return iter;
+
+               *head = (struct block_range){
+                       .start          = entry->start,
+                       .end            = start - 1,
+                       .is_target      = entry->is_target,
+                       .is_branch      = 0,
+
+                       .coverage       = entry->coverage,
+                       .entry          = entry->entry,
+               };
+
+               entry->start            = start;
+               entry->is_target        = 1;
+               entry->entry            = 0;
+
+               rb_link_left_of_node(&head->node, &entry->node);
+               rb_insert_color(&head->node, &block_ranges.root);
+               block_range__debug();
+
+       } else if (entry->start == start)
+               entry->is_target = 1;
+
+       iter.start = entry;
+
+do_tail:
+       /*
+        * At this point we've got: @iter.start = [@start...] but @end can still be
+        * inside or beyond it.
+        */
+       entry = iter.start;
+       for (;;) {
+               /*
+                * If @end is inside @entry, split.
+                */
+               if (end < entry->end) { /* split: [...end][...e->end] */
+                       struct block_range *tail = malloc(sizeof(struct block_range));
+                       if (!tail)
+                               return iter;
+
+                       *tail = (struct block_range){
+                               .start          = end + 1,
+                               .end            = entry->end,
+                               .is_target      = 0,
+                               .is_branch      = entry->is_branch,
+
+                               .coverage       = entry->coverage,
+                               .taken          = entry->taken,
+                               .pred           = entry->pred,
+                       };
+
+                       entry->end              = end;
+                       entry->is_branch        = 1;
+                       entry->taken            = 0;
+                       entry->pred             = 0;
+
+                       rb_link_right_of_node(&tail->node, &entry->node);
+                       rb_insert_color(&tail->node, &block_ranges.root);
+                       block_range__debug();
+
+                       iter.end = entry;
+                       goto done;
+               }
+
+               /*
+                * If @end matches @entry, done
+                */
+               if (end == entry->end) {
+                       entry->is_branch = 1;
+                       iter.end = entry;
+                       goto done;
+               }
+
+               next = block_range__next(entry);
+               if (!next)
+                       goto add_tail;
+
+               /*
+                * If @end is beyond @entry but not inside @next, add a tail.
+                */
+               if (end < next->start) { /* add tail: [...e->end][...end] */
+                       struct block_range *tail;
+add_tail:
+                       tail = malloc(sizeof(struct block_range));
+                       if (!tail)
+                               return iter;
+
+                       *tail = (struct block_range){
+                               .start          = entry->end + 1,
+                               .end            = end,
+                               .is_target      = 0,
+                               .is_branch      = 1,
+                       };
+
+                       rb_link_right_of_node(&tail->node, &entry->node);
+                       rb_insert_color(&tail->node, &block_ranges.root);
+                       block_range__debug();
+
+                       iter.end = tail;
+                       goto done;
+               }
+
+               /*
+                * If there is a hole between @entry and @next, fill it.
+                */
+               if (entry->end + 1 != next->start) {
+                       struct block_range *hole = malloc(sizeof(struct block_range));
+                       if (!hole)
+                               return iter;
+
+                       *hole = (struct block_range){
+                               .start          = entry->end + 1,
+                               .end            = next->start - 1,
+                               .is_target      = 0,
+                               .is_branch      = 0,
+                       };
+
+                       rb_link_left_of_node(&hole->node, &next->node);
+                       rb_insert_color(&hole->node, &block_ranges.root);
+                       block_range__debug();
+               }
+
+               entry = next;
+       }
+
+done:
+       assert(iter.start->start == start && iter.start->is_target);
+       assert(iter.end->end == end && iter.end->is_branch);
+
+       block_ranges.blocks++;
+
+       return iter;
+}
+
+
+/*
+ * Compute coverage as:
+ *
+ *    br->coverage / br->sym->max_coverage
+ *
+ * This ensures each symbol has a 100% spot, reflecting that every symbol has
+ * its most covered section.
+ *
+ * Returns [0-1] for coverage and -1 if we had no data whatsoever or the
+ * symbol does not exist.
+ */
+double block_range__coverage(struct block_range *br)
+{
+       struct symbol *sym;
+
+       if (!br) {
+               if (block_ranges.blocks)
+                       return 0;
+
+               return -1;
+       }
+
+       sym = br->sym;
+       if (!sym)
+               return -1;
+
+       return (double)br->coverage / symbol__annotation(sym)->max_coverage;
+}
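
For orientation only, here is a minimal sketch of how a consumer could feed one sampled basic block into the new tree and bump the per-range counters that block_range__coverage() above later normalizes against the symbol's max_coverage. It is written against the headers added in this series, but the helper name account_branch_block() and the exact way ->entry/->taken/->pred are accumulated are illustrative assumptions, not the accounting perf itself performs (setting br->sym and updating max_coverage is also left out).

#include <stdbool.h>
#include "block-range.h"

/* Illustrative only: record one sampled basic block spanning [start, end]. */
static void account_branch_block(u64 start, u64 end, bool taken, bool predicted)
{
	struct block_range_iter iter = block_range__create(start, end);

	if (!block_range_iter__valid(&iter))
		return;	/* allocation failed; drop this sample */

	/* The head is the jump target, the tail ends in the branch itself. */
	iter.start->entry++;
	if (taken)
		iter.end->taken++;
	if (predicted)
		iter.end->pred++;

	/* Every non-overlapping sub-range spanned by this block gets a hit. */
	do {
		block_range_iter(&iter)->coverage++;
	} while (block_range_iter__next(&iter));
}
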
diff --git a/tools/perf/util/block-range.h b/tools/perf/util/block-range.h
new file mode 100644 (file)
index 0000000..a8c8413
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef __PERF_BLOCK_RANGE_H
+#define __PERF_BLOCK_RANGE_H
+
+#include "symbol.h"
+
+/*
+ * struct block_range - non-overlapping parts of basic blocks
+ * @node:      treenode
+ * @start:     inclusive start of range
+ * @end:       inclusive end of range
+ * @is_target: @start is a jump target
+ * @is_branch: @end is a branch instruction
+ * @coverage:  number of blocks that cover this range
+ * @taken:     number of times the branch is taken (requires @is_branch)
+ * @pred:      number of times the taken branch was predicted
+ */
+struct block_range {
+       struct rb_node node;
+
+       struct symbol *sym;
+
+       u64 start;
+       u64 end;
+
+       int is_target, is_branch;
+
+       u64 coverage;
+       u64 entry;
+       u64 taken;
+       u64 pred;
+};
+
+static inline struct block_range *block_range__next(struct block_range *br)
+{
+       struct rb_node *n = rb_next(&br->node);
+       if (!n)
+               return NULL;
+       return rb_entry(n, struct block_range, node);
+}
+
+struct block_range_iter {
+       struct block_range *start;
+       struct block_range *end;
+};
+
+static inline struct block_range *block_range_iter(struct block_range_iter *iter)
+{
+       return iter->start;
+}
+
+static inline bool block_range_iter__next(struct block_range_iter *iter)
+{
+       if (iter->start == iter->end)
+               return false;
+
+       iter->start = block_range__next(iter->start);
+       return true;
+}
+
+static inline bool block_range_iter__valid(struct block_range_iter *iter)
+{
+       if (!iter->start || !iter->end)
+               return false;
+       return true;
+}
+
+extern struct block_range *block_range__find(u64 addr);
+extern struct block_range_iter block_range__create(u64 start, u64 end);
+extern double block_range__coverage(struct block_range *br);
+
+#endif /* __PERF_BLOCK_RANGE_H */
index 1f12e4e4000605e68351194731969df0c9081005..2b2c9b82f5abc55d22ff451ec32161c558091895 100644 (file)
@@ -531,7 +531,7 @@ static int map_prologue(struct perf_probe_event *pev, int *mapping,
 
        ptevs = malloc(array_sz);
        if (!ptevs) {
-               pr_debug("No ehough memory: alloc ptevs failed\n");
+               pr_debug("Not enough memory: alloc ptevs failed\n");
                return -ENOMEM;
        }
 
index 8c4212abd19b48b9e84ca1f9978f06561c05576a..c1838b643108bda4d6fc536ae232f20e643a9ce2 100644 (file)
@@ -6,6 +6,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <api/debug.h>
+#include <linux/time64.h>
 
 #include "cache.h"
 #include "color.h"
@@ -14,9 +15,6 @@
 #include "util.h"
 #include "target.h"
 
-#define NSECS_PER_SEC  1000000000ULL
-#define NSECS_PER_USEC 1000ULL
-
 int verbose;
 bool dump_trace = false, quiet = false;
 int debug_ordered_events;
@@ -54,9 +52,9 @@ static int veprintf_time(u64 t, const char *fmt, va_list args)
        int ret = 0;
        u64 secs, usecs, nsecs = t;
 
-       secs   = nsecs / NSECS_PER_SEC;
-       nsecs -= secs  * NSECS_PER_SEC;
-       usecs  = nsecs / NSECS_PER_USEC;
+       secs   = nsecs / NSEC_PER_SEC;
+       nsecs -= secs  * NSEC_PER_SEC;
+       usecs  = nsecs / NSEC_PER_USEC;
 
        ret = fprintf(stderr, "[%13" PRIu64 ".%06" PRIu64 "] ",
                      secs, usecs);
index a347b19c961a4bf87949b0cc3ae6feb4f496e774..faec899435f2cf2ca1b035f3aff7dec9a9b47f1b 100644 (file)
@@ -1085,3 +1085,182 @@ int die_get_var_range(Dwarf_Die *sp_die __maybe_unused,
        return -ENOTSUP;
 }
 #endif
+
+/*
+ * die_has_loclist - Check if DW_AT_location of @vr_die is a location list
+ * @vr_die: a variable DIE
+ */
+static bool die_has_loclist(Dwarf_Die *vr_die)
+{
+       Dwarf_Attribute loc;
+       int tag = dwarf_tag(vr_die);
+
+       if (tag != DW_TAG_formal_parameter &&
+           tag != DW_TAG_variable)
+               return false;
+
+       return (dwarf_attr_integrate(vr_die, DW_AT_location, &loc) &&
+               dwarf_whatform(&loc) == DW_FORM_sec_offset);
+}
+
+/*
+ * die_is_optimized_target - Check if target program is compiled with
+ * optimization
+ * @cu_die: a CU DIE
+ *
+ * If any object in the given CU has a DW_AT_location that is a location
+ * list, the target program was compiled with optimization. This applies to
+ * clang as well.
+ */
+bool die_is_optimized_target(Dwarf_Die *cu_die)
+{
+       Dwarf_Die tmp_die;
+
+       if (die_has_loclist(cu_die))
+               return true;
+
+       if (!dwarf_child(cu_die, &tmp_die) &&
+           die_is_optimized_target(&tmp_die))
+               return true;
+
+       if (!dwarf_siblingof(cu_die, &tmp_die) &&
+           die_is_optimized_target(&tmp_die))
+               return true;
+
+       return false;
+}
+
+/*
+ * die_search_idx - Search index of given line address
+ * @lines: Line records of single CU
+ * @nr_lines: Number of @lines
+ * @addr: address we are looking for
+ * @idx: index to be set by this function (return value)
+ *
+ * Search for @addr by looping over all line records of the CU. If the
+ * address matches, set the index of that line in @idx. Note that a single
+ * source line can have multiple line records, i.e. a single source line
+ * can have multiple indices.
+ */
+static bool die_search_idx(Dwarf_Lines *lines, unsigned long nr_lines,
+                          Dwarf_Addr addr, unsigned long *idx)
+{
+       unsigned long i;
+       Dwarf_Addr tmp;
+
+       for (i = 0; i < nr_lines; i++) {
+               if (dwarf_lineaddr(dwarf_onesrcline(lines, i), &tmp))
+                       return false;
+
+               if (tmp == addr) {
+                       *idx = i;
+                       return true;
+               }
+       }
+       return false;
+}
+
+/*
+ * die_get_postprologue_addr - Search next address after function prologue
+ * @entrypc_idx: entrypc index
+ * @lines: Line records of single CU
+ * @nr_lines: Number of @lines
+ * @highpc: high PC address of function
+ * @postprologue_addr: Next address after function prologue (return value)
+ *
+ * Look for prologue-end marker. If there is no explicit marker, return
+ * address of next line record or next source line.
+ */
+static bool die_get_postprologue_addr(unsigned long entrypc_idx,
+                                     Dwarf_Lines *lines,
+                                     unsigned long nr_lines,
+                                     Dwarf_Addr highpc,
+                                     Dwarf_Addr *postprologue_addr)
+{
+       unsigned long i;
+       int entrypc_lno, lno;
+       Dwarf_Line *line;
+       Dwarf_Addr addr;
+       bool p_end;
+
+       /* entrypc_lno is actual source line number */
+       line = dwarf_onesrcline(lines, entrypc_idx);
+       if (dwarf_lineno(line, &entrypc_lno))
+               return false;
+
+       for (i = entrypc_idx; i < nr_lines; i++) {
+               line = dwarf_onesrcline(lines, i);
+
+               if (dwarf_lineaddr(line, &addr) ||
+                   dwarf_lineno(line, &lno)    ||
+                   dwarf_lineprologueend(line, &p_end))
+                       return false;
+
+               /* highpc is exclusive. [entrypc,highpc) */
+               if (addr >= highpc)
+                       break;
+
+               /* clang supports prologue-end marker */
+               if (p_end)
+                       break;
+
+               /* Actual next line in source */
+               if (lno != entrypc_lno)
+                       break;
+
+               /*
+                * A single source line can have multiple line records.
+                * For example,
+                *     void foo() { printf("hello\n"); }
+                * contains two line records: one points to the declaration and
+                * the other points to the printf() line. Variable 'lno' won't get
+                * incremented in this case but 'i' will.
+                */
+               if (i != entrypc_idx)
+                       break;
+       }
+
+       dwarf_lineaddr(line, postprologue_addr);
+       if (*postprologue_addr >= highpc)
+               dwarf_lineaddr(dwarf_onesrcline(lines, i - 1),
+                              postprologue_addr);
+
+       return true;
+}
+
+/*
+ * die_skip_prologue - Use next address after prologue as probe location
+ * @sp_die: a subprogram DIE
+ * @cu_die: a CU DIE
+ * @entrypc: entrypc of the function
+ *
+ * The function prologue prepares the stack and registers before executing the
+ * function logic. When the target program is compiled without optimization,
+ * function parameter information is only valid after the prologue. If we probe
+ * the entrypc of the function and try to record function parameters, they
+ * contain garbage values.
+ */
+void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die,
+                      Dwarf_Addr *entrypc)
+{
+       size_t nr_lines = 0;
+       unsigned long entrypc_idx = 0;
+       Dwarf_Lines *lines = NULL;
+       Dwarf_Addr postprologue_addr;
+       Dwarf_Addr highpc;
+
+       if (dwarf_highpc(sp_die, &highpc))
+               return;
+
+       if (dwarf_getsrclines(cu_die, &lines, &nr_lines))
+               return;
+
+       if (!die_search_idx(lines, nr_lines, *entrypc, &entrypc_idx))
+               return;
+
+       if (!die_get_postprologue_addr(entrypc_idx, lines, nr_lines,
+                                      highpc, &postprologue_addr))
+               return;
+
+       *entrypc = postprologue_addr;
+}
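
A minimal sketch, consistent with the comments in this hunk, of how a probe-point finder could combine the two helpers added above: skip the prologue only when the target does not look optimized, so that probed parameters hold valid values. The wrapper fixup_probe_address() is hypothetical; only die_is_optimized_target() and die_skip_prologue() come from this patch.

#include "dwarf-aux.h"

/* Hypothetical wrapper: adjust @entrypc before placing a probe on @sp_die. */
static void fixup_probe_address(Dwarf_Die *sp_die, Dwarf_Die *cu_die,
				Dwarf_Addr *entrypc)
{
	/*
	 * Optimized targets carry location lists, so variable locations are
	 * already tracked per address; leave the entry address alone.
	 */
	if (die_is_optimized_target(cu_die))
		return;

	/* Unoptimized: move past the prologue so parameters are initialized. */
	die_skip_prologue(sp_die, cu_die, entrypc);
}
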
index dc0ce1adb075bf6fac0a7296fdd0050b5bdeb4b7..8b6d2f83af0210feca3b6cbd9ee190d5986de699 100644 (file)
@@ -125,4 +125,12 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf);
 /* Get the name and type of given variable DIE, stored as "type\tname" */
 int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf);
 int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf);
+
+/* Check if target program is compiled with optimization */
+bool die_is_optimized_target(Dwarf_Die *cu_die);
+
+/* Use next address after prologue as probe location */
+void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die,
+                      Dwarf_Addr *entrypc);
+
 #endif
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
new file mode 100644 (file)
index 0000000..62bc4a8
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ *
+ * Written by: Masami Hiramatsu <mhiramat@kernel.org>
+ */
+
+#include <util.h>
+#include <debug.h>
+#include <dwarf-regs.h>
+#include <elf.h>
+
+#ifndef EM_AARCH64
+#define EM_AARCH64     183  /* ARM 64 bit */
+#endif
+
+/* Define const char * {arch}_register_tbl[] */
+#define DEFINE_DWARF_REGSTR_TABLE
+#include "../arch/x86/include/dwarf-regs-table.h"
+#include "../arch/arm/include/dwarf-regs-table.h"
+#include "../arch/arm64/include/dwarf-regs-table.h"
+#include "../arch/sh/include/dwarf-regs-table.h"
+#include "../arch/powerpc/include/dwarf-regs-table.h"
+#include "../arch/s390/include/dwarf-regs-table.h"
+#include "../arch/sparc/include/dwarf-regs-table.h"
+#include "../arch/xtensa/include/dwarf-regs-table.h"
+
+#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
+
+/* Return architecture dependent register string (for kprobe-tracer) */
+const char *get_dwarf_regstr(unsigned int n, unsigned int machine)
+{
+       switch (machine) {
+       case EM_NONE:   /* Generic arch - use host arch */
+               return get_arch_regstr(n);
+       case EM_386:
+               return __get_dwarf_regstr(x86_32_regstr_tbl, n);
+       case EM_X86_64:
+               return __get_dwarf_regstr(x86_64_regstr_tbl, n);
+       case EM_ARM:
+               return __get_dwarf_regstr(arm_regstr_tbl, n);
+       case EM_AARCH64:
+               return __get_dwarf_regstr(aarch64_regstr_tbl, n);
+       case EM_SH:
+               return __get_dwarf_regstr(sh_regstr_tbl, n);
+       case EM_S390:
+               return __get_dwarf_regstr(s390_regstr_tbl, n);
+       case EM_PPC:
+       case EM_PPC64:
+               return __get_dwarf_regstr(powerpc_regstr_tbl, n);
+       case EM_SPARC:
+       case EM_SPARCV9:
+               return __get_dwarf_regstr(sparc_regstr_tbl, n);
+       case EM_XTENSA:
+               return __get_dwarf_regstr(xtensa_regstr_tbl, n);
+       default:
+               pr_err("ELF MACHINE %x is not supported.\n", machine);
+       }
+       return NULL;
+}
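
A small, hedged usage example for the table above: translate a DWARF register operand into a kprobe-tracer register string for a given ELF machine. Register 0 on EM_X86_64 (which the x86-64 table is expected to map to "%ax") is chosen purely for illustration.

#include <stdio.h>
#include <elf.h>
#include <dwarf-regs.h>

/* Illustrative only: print the tracer name of DWARF register 0 on x86-64. */
static void show_dwarf_reg0(void)
{
	const char *name = get_dwarf_regstr(0, EM_X86_64);

	if (name)
		printf("DWARF reg 0 on x86-64 -> %s\n", name);
	else
		printf("register 0 is not mapped for this machine\n");
}
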
index e20438b784bed4019e50080e54fd06fad2c0776e..6c3017139c67d9a55e331e9b0bf29c30124ec033 100644 (file)
@@ -1,5 +1,6 @@
 #include <linux/types.h>
 #include <sys/mman.h>
+#include <api/fs/fs.h>
 #include "event.h"
 #include "debug.h"
 #include "hist.h"
@@ -248,6 +249,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
        bool truncation = false;
        unsigned long long timeout = proc_map_timeout * 1000000ULL;
        int rc = 0;
+#ifdef MAP_HUGETLB
+       const char *hugetlbfs_mnt = hugetlbfs__mountpoint();
+       int hugetlbfs_mnt_len = hugetlbfs_mnt ? strlen(hugetlbfs_mnt) : 0;
+#endif
 
        if (machine__is_default_guest(machine))
                return 0;
@@ -342,6 +347,12 @@ out:
 
                if (!strcmp(execname, ""))
                        strcpy(execname, anonstr);
+#ifdef MAP_HUGETLB
+               if (!strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) {
+                       strcpy(execname, anonstr);
+                       event->mmap2.flags |= MAP_HUGETLB;
+               }
+#endif
 
                size = strlen(execname) + 1;
                memcpy(event->mmap2.filename, execname, size);
@@ -1286,7 +1297,7 @@ try_again:
                 * must be done prior to using kernel maps.
                 */
                if (load_map)
-                       map__load(al->map, machine->symbol_filter);
+                       map__load(al->map);
                al->addr = al->map->map_ip(al->map, al->addr);
        }
 }
@@ -1297,8 +1308,7 @@ void thread__find_addr_location(struct thread *thread,
 {
        thread__find_addr_map(thread, cpumode, type, addr, al);
        if (al->map != NULL)
-               al->sym = map__find_symbol(al->map, al->addr,
-                                          thread->mg->machine->symbol_filter);
+               al->sym = map__find_symbol(al->map, al->addr);
        else
                al->sym = NULL;
 }
@@ -1359,8 +1369,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al,
                        al->filtered |= (1 << HIST_FILTER__DSO);
                }
 
-               al->sym = map__find_symbol(al->map, al->addr,
-                                          machine->symbol_filter);
+               al->sym = map__find_symbol(al->map, al->addr);
        }
 
        if (symbol_conf.sym_list &&
@@ -1416,5 +1425,5 @@ void thread__resolve(struct thread *thread, struct addr_location *al,
        al->sym = NULL;
 
        if (al->map)
-               al->sym = map__find_symbol(al->map, al->addr, NULL);
+               al->sym = map__find_symbol(al->map, al->addr);
 }
index 097b3ed77fddcffe39f7530d75ff5b665f9314a7..ea34c5a32c11c78a73575f6c7d666202180e2903 100644 (file)
@@ -1032,16 +1032,18 @@ perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused,
 }
 
 static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
-                                      struct mmap_params *mp, int cpu,
+                                      struct mmap_params *mp, int cpu_idx,
                                       int thread, int *_output, int *_output_backward)
 {
        struct perf_evsel *evsel;
        int revent;
+       int evlist_cpu = cpu_map__cpu(evlist->cpus, cpu_idx);
 
        evlist__for_each_entry(evlist, evsel) {
                struct perf_mmap *maps = evlist->mmap;
                int *output = _output;
                int fd;
+               int cpu;
 
                if (evsel->attr.write_backward) {
                        output = _output_backward;
@@ -1060,6 +1062,10 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
                if (evsel->system_wide && thread)
                        continue;
 
+               cpu = cpu_map__idx(evsel->cpus, evlist_cpu);
+               if (cpu == -1)
+                       continue;
+
                fd = FD(evsel, cpu, thread);
 
                if (*output == -1) {
index 3674e77ad64049dddf34e99674d72eaa77792a30..9111e0666950dd535e2f4d8cf1df9fbefe145204 100644 (file)
@@ -122,7 +122,7 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
                        if (!node)
                                break;
 
-                       if (node->sym && node->sym->ignore)
+                       if (node->sym && node->sym->idle)
                                goto next;
 
                        printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
@@ -181,7 +181,7 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
        if (cursor != NULL) {
                printed += sample__fprintf_callchain(sample, left_alignment,
                                                     print_opts, cursor, fp);
-       } else if (!(al->sym && al->sym->ignore)) {
+       } else if (!(al->sym && al->sym->idle)) {
                printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
 
                if (print_ip)
index 8f0db4007282fabfe99fdcdc80de480d38e30510..85dd0db0a127995a725eade74a12e5da8569b80c 100644 (file)
@@ -828,8 +828,7 @@ static int write_group_desc(int fd, struct perf_header *h __maybe_unused,
  * default get_cpuid(): nothing gets recorded
  * actual implementation must be in arch/$(ARCH)/util/header.c
  */
-int __attribute__ ((weak)) get_cpuid(char *buffer __maybe_unused,
-                                    size_t sz __maybe_unused)
+int __weak get_cpuid(char *buffer __maybe_unused, size_t sz __maybe_unused)
 {
        return -1;
 }
index 0a1edf1ab4502f5bbc84396633ed92f8e4ce500b..a002c93fe422ff61ec1cfce66e46a2972812f3f8 100644 (file)
@@ -230,7 +230,7 @@ struct perf_hpp {
 struct perf_hpp_fmt {
        const char *name;
        int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
-                     struct hists *hists);
+                     struct hists *hists, int line, int *span);
        int (*width)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
                     struct hists *hists);
        int (*color)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
@@ -259,6 +259,7 @@ struct perf_hpp_list {
        struct list_head fields;
        struct list_head sorts;
 
+       int nr_header_lines;
        int need_collapse;
        int parent;
        int sym;
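
For reference, a hedged sketch of what the widened header() callback makes possible: printing different text per header line, while a single-line format (like hpp__header_fn earlier in this series) can simply ignore line and leave *span untouched. The "Samples" group label below is made up for illustration, and such a format only gets a second line once the owning perf_hpp_list raises nr_header_lines above its default of 1.

static int example_header_fn(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
			     struct hists *hists, int line,
			     int *span __maybe_unused)
{
	int len = fmt->width(fmt, hpp, hists);

	/* Hypothetical layout: line 0 carries a group label, line 1 the column name. */
	if (line == 0)
		return scnprintf(hpp->buf, hpp->size, "%*s", len, "Samples");

	return scnprintf(hpp->buf, hpp->size, "%*s", len, fmt->name);
}
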
index 07c644ed64c4ecd9809bf322297e5a3876e83aad..43bfd8da7919a68e41ab98032221edad25f1cd9c 100644 (file)
@@ -3,6 +3,12 @@
 
 #ifdef HAVE_DWARF_SUPPORT
 const char *get_arch_regstr(unsigned int n);
+/*
+ * get_dwarf_regstr - Returns ftrace register string from DWARF regnum
+ * n: DWARF register number
+ * machine: ELF machine signature (EM_*)
+ */
+const char *get_dwarf_regstr(unsigned int n, unsigned int machine);
 #endif
 
 #ifdef HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET
index 749e6f2e37ca800b6fb4ab5009ed13cfbe2ab956..f545ec1e758a722d1d462e9defd6ff1ba186145f 100644 (file)
@@ -346,7 +346,7 @@ static int intel_bts_get_next_insn(struct intel_bts_queue *btsq, u64 ip)
                goto out_put;
 
        /* Load maps to ensure dso->is_64_bit has been updated */
-       map__load(al.map, machine->symbol_filter);
+       map__load(al.map);
 
        x86_64 = al.map->dso->is_64_bit;
 
index 551ff6f640be85fef9362034a487ff5d4ea0c807..b9cc353cace211d5cff3508c779ca083b4691010 100644 (file)
@@ -477,7 +477,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn,
                start_ip = *ip;
 
                /* Load maps to ensure dso->is_64_bit has been updated */
-               map__load(al.map, machine->symbol_filter);
+               map__load(al.map);
 
                x86_64 = al.map->dso->is_64_bit;
 
@@ -1294,7 +1294,7 @@ static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
        if (!map)
                return 0;
 
-       if (map__load(map, machine->symbol_filter))
+       if (map__load(map))
                return 0;
 
        start = dso__first_symbol(map->dso, MAP__FUNCTION);
index 95a1acb61245983660ff0f2e0423c7403f80c4f4..9ddea5cecd94b57f9d177f94f447d89f4716a5a8 100644 (file)
@@ -29,6 +29,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
        lzma_action action = LZMA_RUN;
        lzma_stream strm   = LZMA_STREAM_INIT;
        lzma_ret ret;
+       int err = -1;
 
        u8 buf_in[BUFSIZE];
        u8 buf_out[BUFSIZE];
@@ -45,7 +46,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
        if (ret != LZMA_OK) {
                pr_err("lzma: lzma_stream_decoder failed %s (%d)\n",
                        lzma_strerror(ret), ret);
-               return -1;
+               goto err_fclose;
        }
 
        strm.next_in   = NULL;
@@ -60,7 +61,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
 
                        if (ferror(infile)) {
                                pr_err("lzma: read error: %s\n", strerror(errno));
-                               return -1;
+                               goto err_fclose;
                        }
 
                        if (feof(infile))
@@ -74,7 +75,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
 
                        if (writen(output_fd, buf_out, write_size) != write_size) {
                                pr_err("lzma: write error: %s\n", strerror(errno));
-                               return -1;
+                               goto err_fclose;
                        }
 
                        strm.next_out  = buf_out;
@@ -83,13 +84,15 @@ int lzma_decompress_to_file(const char *input, int output_fd)
 
                if (ret != LZMA_OK) {
                        if (ret == LZMA_STREAM_END)
-                               return 0;
+                               break;
 
                        pr_err("lzma: failed %s\n", lzma_strerror(ret));
-                       return -1;
+                       goto err_fclose;
                }
        }
 
+       err = 0;
+err_fclose:
        fclose(infile);
-       return 0;
+       return err;
 }
index cb6388dbdd9875a588207286946cb5bea7887767..18e4519abef2ca0a52d1fccd1e3d8c4bc7b5f346 100644 (file)
@@ -41,7 +41,6 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 
        machine->pid = pid;
 
-       machine->symbol_filter = NULL;
        machine->id_hdr_size = 0;
        machine->kptr_restrict_warned = false;
        machine->comm_exec = false;
@@ -148,7 +147,6 @@ void machines__init(struct machines *machines)
 {
        machine__init(&machines->host, "", HOST_KERNEL_ID);
        machines->guests = RB_ROOT;
-       machines->symbol_filter = NULL;
 }
 
 void machines__exit(struct machines *machines)
@@ -172,8 +170,6 @@ struct machine *machines__add(struct machines *machines, pid_t pid,
                return NULL;
        }
 
-       machine->symbol_filter = machines->symbol_filter;
-
        while (*p != NULL) {
                parent = *p;
                pos = rb_entry(parent, struct machine, rb_node);
@@ -189,21 +185,6 @@ struct machine *machines__add(struct machines *machines, pid_t pid,
        return machine;
 }
 
-void machines__set_symbol_filter(struct machines *machines,
-                                symbol_filter_t symbol_filter)
-{
-       struct rb_node *nd;
-
-       machines->symbol_filter = symbol_filter;
-       machines->host.symbol_filter = symbol_filter;
-
-       for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
-               struct machine *machine = rb_entry(nd, struct machine, rb_node);
-
-               machine->symbol_filter = symbol_filter;
-       }
-}
-
 void machines__set_comm_exec(struct machines *machines, bool comm_exec)
 {
        struct rb_node *nd;
@@ -916,10 +897,10 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid)
 }
 
 int __machine__load_kallsyms(struct machine *machine, const char *filename,
-                            enum map_type type, bool no_kcore, symbol_filter_t filter)
+                            enum map_type type, bool no_kcore)
 {
        struct map *map = machine__kernel_map(machine);
-       int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore, filter);
+       int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore);
 
        if (ret > 0) {
                dso__set_loaded(map->dso, type);
@@ -935,16 +916,15 @@ int __machine__load_kallsyms(struct machine *machine, const char *filename,
 }
 
 int machine__load_kallsyms(struct machine *machine, const char *filename,
-                          enum map_type type, symbol_filter_t filter)
+                          enum map_type type)
 {
-       return __machine__load_kallsyms(machine, filename, type, false, filter);
+       return __machine__load_kallsyms(machine, filename, type, false);
 }
 
-int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
-                              symbol_filter_t filter)
+int machine__load_vmlinux_path(struct machine *machine, enum map_type type)
 {
        struct map *map = machine__kernel_map(machine);
-       int ret = dso__load_vmlinux_path(map->dso, map, filter);
+       int ret = dso__load_vmlinux_path(map->dso, map);
 
        if (ret > 0)
                dso__set_loaded(map->dso, type);
@@ -1313,7 +1293,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
                        /*
                         * preload dso of guest kernel and modules
                         */
-                       dso__load(kernel, machine__kernel_map(machine), NULL);
+                       dso__load(kernel, machine__kernel_map(machine));
                }
        }
        return 0;
@@ -2115,7 +2095,7 @@ int machine__get_kernel_start(struct machine *machine)
         */
        machine->kernel_start = 1ULL << 63;
        if (map) {
-               err = map__load(map, machine->symbol_filter);
+               err = map__load(map);
                if (map->start)
                        machine->kernel_start = map->start;
        }
@@ -2131,7 +2111,7 @@ char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, ch
 {
        struct machine *machine = vmachine;
        struct map *map;
-       struct symbol *sym = map_groups__find_symbol(&machine->kmaps, MAP__FUNCTION, *addrp, &map,  NULL);
+       struct symbol *sym = map_groups__find_symbol(&machine->kmaps, MAP__FUNCTION, *addrp, &map);
 
        if (sym == NULL)
                return NULL;
index 20739f746bc4e583cbeaf9e20673af115b3256a4..354de6e56109aa7f5ecc770ad186332d818d42c6 100644 (file)
@@ -41,7 +41,6 @@ struct machine {
        struct map_groups kmaps;
        struct map        *vmlinux_maps[MAP__NR_TYPES];
        u64               kernel_start;
-       symbol_filter_t   symbol_filter;
        pid_t             *current_tid;
        union { /* Tool specific area */
                void      *priv;
@@ -110,7 +109,6 @@ typedef void (*machine__process_t)(struct machine *machine, void *data);
 struct machines {
        struct machine host;
        struct rb_root guests;
-       symbol_filter_t symbol_filter;
 };
 
 void machines__init(struct machines *machines);
@@ -128,8 +126,6 @@ struct machine *machines__findnew(struct machines *machines, pid_t pid);
 void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size);
 char *machine__mmap_name(struct machine *machine, char *bf, size_t size);
 
-void machines__set_symbol_filter(struct machines *machines,
-                                symbol_filter_t symbol_filter);
 void machines__set_comm_exec(struct machines *machines, bool comm_exec);
 
 struct machine *machine__new_host(void);
@@ -178,40 +174,33 @@ size_t machine__fprintf(struct machine *machine, FILE *fp);
 static inline
 struct symbol *machine__find_kernel_symbol(struct machine *machine,
                                           enum map_type type, u64 addr,
-                                          struct map **mapp,
-                                          symbol_filter_t filter)
+                                          struct map **mapp)
 {
-       return map_groups__find_symbol(&machine->kmaps, type, addr,
-                                      mapp, filter);
+       return map_groups__find_symbol(&machine->kmaps, type, addr, mapp);
 }
 
 static inline
 struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine,
                                                   enum map_type type, const char *name,
-                                                  struct map **mapp,
-                                                  symbol_filter_t filter)
+                                                  struct map **mapp)
 {
-       return map_groups__find_symbol_by_name(&machine->kmaps, type, name,
-                                              mapp, filter);
+       return map_groups__find_symbol_by_name(&machine->kmaps, type, name, mapp);
 }
 
 static inline
 struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr,
-                                            struct map **mapp,
-                                            symbol_filter_t filter)
+                                            struct map **mapp)
 {
        return machine__find_kernel_symbol(machine, MAP__FUNCTION, addr,
-                                          mapp, filter);
+                                          mapp);
 }
 
 static inline
 struct symbol *machine__find_kernel_function_by_name(struct machine *machine,
                                                     const char *name,
-                                                    struct map **mapp,
-                                                    symbol_filter_t filter)
+                                                    struct map **mapp)
 {
-       return map_groups__find_function_by_name(&machine->kmaps, name, mapp,
-                                                filter);
+       return map_groups__find_function_by_name(&machine->kmaps, name, mapp);
 }
 
 struct map *machine__findnew_module_map(struct machine *machine, u64 start,
@@ -219,11 +208,10 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start,
 int arch__fix_module_text_start(u64 *start, const char *name);
 
 int __machine__load_kallsyms(struct machine *machine, const char *filename,
-                            enum map_type type, bool no_kcore, symbol_filter_t filter);
+                            enum map_type type, bool no_kcore);
 int machine__load_kallsyms(struct machine *machine, const char *filename,
-                          enum map_type type, symbol_filter_t filter);
-int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
-                              symbol_filter_t filter);
+                          enum map_type type);
+int machine__load_vmlinux_path(struct machine *machine, enum map_type type);
 
 size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp,
                                     bool (skip)(struct dso *dso, int parm), int parm);
index 728129ac653a7e371fae4ebf13c67bd7e4d3de58..d51a1257973b8ad50a5074a60d08b8096b1a602e 100644 (file)
@@ -6,6 +6,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <unistd.h>
+#include <sys/mman.h>
 #include "map.h"
 #include "thread.h"
 #include "strlist.h"
@@ -24,9 +25,15 @@ const char *map_type__name[MAP__NR_TYPES] = {
        [MAP__VARIABLE] = "Variables",
 };
 
-static inline int is_anon_memory(const char *filename)
+static inline int is_anon_memory(const char *filename, u32 flags)
 {
-       return !strcmp(filename, "//anon") ||
+       u32 anon_flags = 0;
+
+#ifdef MAP_HUGETLB
+       anon_flags |= MAP_HUGETLB;
+#endif
+       return flags & anon_flags ||
+              !strcmp(filename, "//anon") ||
               !strncmp(filename, "/dev/zero", sizeof("/dev/zero") - 1) ||
               !strncmp(filename, "/anon_hugepage", sizeof("/anon_hugepage") - 1);
 }
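
The hunk above widens the anonymous-mapping test: besides the well-known filenames,
any mapping whose mmap flags include MAP_HUGETLB is now treated as anonymous. A
minimal standalone sketch of that rule follows; the helper name and the sample paths
are illustrative, not taken from the perf sources.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Anonymous either by mmap flag (where MAP_HUGETLB exists) or by one of
 * the conventional anonymous-looking filenames. */
static int looks_anonymous(const char *filename, unsigned int flags)
{
        unsigned int anon_flags = 0;

#ifdef MAP_HUGETLB
        anon_flags |= MAP_HUGETLB;
#endif
        return (flags & anon_flags) ||
               !strcmp(filename, "//anon") ||
               !strncmp(filename, "/dev/zero", strlen("/dev/zero")) ||
               !strncmp(filename, "/anon_hugepage", strlen("/anon_hugepage"));
}

int main(void)
{
        unsigned int flags = 0;

#ifdef MAP_HUGETLB
        flags = MAP_HUGETLB;    /* a named hugetlbfs file still counts as anon */
#endif
        printf("%d\n", looks_anonymous("/mnt/hugetlbfs/map_0", flags));
        printf("%d\n", looks_anonymous("/usr/lib/libc.so.6", 0));
        return 0;
}
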
@@ -155,7 +162,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
                int anon, no_dso, vdso, android;
 
                android = is_android_lib(filename);
-               anon = is_anon_memory(filename);
+               anon = is_anon_memory(filename, flags);
                vdso = is_vdso_map(filename);
                no_dso = is_no_dso_memory(filename);
 
@@ -279,7 +286,7 @@ void map__fixup_end(struct map *map)
 
 #define DSO__DELETED "(deleted)"
 
-int map__load(struct map *map, symbol_filter_t filter)
+int map__load(struct map *map)
 {
        const char *name = map->dso->long_name;
        int nr;
@@ -287,7 +294,7 @@ int map__load(struct map *map, symbol_filter_t filter)
        if (dso__loaded(map->dso, map->type))
                return 0;
 
-       nr = dso__load(map->dso, map, filter);
+       nr = dso__load(map->dso, map);
        if (nr < 0) {
                if (map->dso->has_build_id) {
                        char sbuild_id[SBUILD_ID_SIZE];
@@ -312,9 +319,6 @@ int map__load(struct map *map, symbol_filter_t filter)
                        pr_warning("%.*s was updated (is prelink enabled?). "
                                "Restart the long running apps that use it!\n",
                                   (int)real_len, name);
-               } else if (filter) {
-                       pr_warning("no symbols passed the given filter.\n");
-                       return -2;      /* Empty but maybe by the filter */
                } else {
                        pr_warning("no symbols found in %s, maybe install "
                                   "a debug package?\n", name);
@@ -331,19 +335,17 @@ int __weak arch__compare_symbol_names(const char *namea, const char *nameb)
        return strcmp(namea, nameb);
 }
 
-struct symbol *map__find_symbol(struct map *map, u64 addr,
-                               symbol_filter_t filter)
+struct symbol *map__find_symbol(struct map *map, u64 addr)
 {
-       if (map__load(map, filter) < 0)
+       if (map__load(map) < 0)
                return NULL;
 
        return dso__find_symbol(map->dso, map->type, addr);
 }
 
-struct symbol *map__find_symbol_by_name(struct map *map, const char *name,
-                                       symbol_filter_t filter)
+struct symbol *map__find_symbol_by_name(struct map *map, const char *name)
 {
-       if (map__load(map, filter) < 0)
+       if (map__load(map) < 0)
                return NULL;
 
        if (!dso__sorted_by_name(map->dso, map->type))
@@ -556,23 +558,22 @@ void map_groups__put(struct map_groups *mg)
 
 struct symbol *map_groups__find_symbol(struct map_groups *mg,
                                       enum map_type type, u64 addr,
-                                      struct map **mapp,
-                                      symbol_filter_t filter)
+                                      struct map **mapp)
 {
        struct map *map = map_groups__find(mg, type, addr);
 
        /* Ensure map is loaded before using map->map_ip */
-       if (map != NULL && map__load(map, filter) >= 0) {
+       if (map != NULL && map__load(map) >= 0) {
                if (mapp != NULL)
                        *mapp = map;
-               return map__find_symbol(map, map->map_ip(map, addr), filter);
+               return map__find_symbol(map, map->map_ip(map, addr));
        }
 
        return NULL;
 }
 
 struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name,
-                                        struct map **mapp, symbol_filter_t filter)
+                                        struct map **mapp)
 {
        struct symbol *sym;
        struct rb_node *nd;
@@ -582,7 +583,7 @@ struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name,
        for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
                struct map *pos = rb_entry(nd, struct map, rb_node);
 
-               sym = map__find_symbol_by_name(pos, name, filter);
+               sym = map__find_symbol_by_name(pos, name);
 
                if (sym == NULL)
                        continue;
@@ -600,15 +601,14 @@ out:
 struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
                                               enum map_type type,
                                               const char *name,
-                                              struct map **mapp,
-                                              symbol_filter_t filter)
+                                              struct map **mapp)
 {
-       struct symbol *sym = maps__find_symbol_by_name(&mg->maps[type], name, mapp, filter);
+       struct symbol *sym = maps__find_symbol_by_name(&mg->maps[type], name, mapp);
 
        return sym;
 }
 
-int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
+int map_groups__find_ams(struct addr_map_symbol *ams)
 {
        if (ams->addr < ams->map->start || ams->addr >= ams->map->end) {
                if (ams->map->groups == NULL)
@@ -620,7 +620,7 @@ int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
        }
 
        ams->al_addr = ams->map->map_ip(ams->map, ams->addr);
-       ams->sym = map__find_symbol(ams->map, ams->al_addr, filter);
+       ams->sym = map__find_symbol(ams->map, ams->al_addr);
 
        return ams->sym ? 0 : -1;
 }
index d83396ceecba686450f3c398593b1824a8f8efbd..abdacf800c98a78148565f23b7d9b4c916cbacaf 100644 (file)
@@ -127,17 +127,14 @@ struct thread;
  * @map: the 'struct map *' in which symbols are iterated
  * @sym_name: the symbol name
  * @pos: the 'struct symbol *' to use as a loop cursor
- * @filter: to use when loading the DSO
  */
-#define __map__for_each_symbol_by_name(map, sym_name, pos, filter)     \
-       for (pos = map__find_symbol_by_name(map, sym_name, filter);     \
+#define __map__for_each_symbol_by_name(map, sym_name, pos)     \
+       for (pos = map__find_symbol_by_name(map, sym_name);     \
             pos && arch__compare_symbol_names(pos->name, sym_name) == 0;       \
             pos = symbol__next_by_name(pos))
 
 #define map__for_each_symbol_by_name(map, sym_name, pos)               \
-       __map__for_each_symbol_by_name(map, sym_name, (pos), NULL)
-
-typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
+       __map__for_each_symbol_by_name(map, sym_name, (pos))
 
 int arch__compare_symbol_names(const char *namea, const char *nameb);
 void map__init(struct map *map, enum map_type type,
@@ -173,11 +170,9 @@ size_t map__fprintf_dsoname(struct map *map, FILE *fp);
 int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix,
                         FILE *fp);
 
-int map__load(struct map *map, symbol_filter_t filter);
-struct symbol *map__find_symbol(struct map *map,
-                               u64 addr, symbol_filter_t filter);
-struct symbol *map__find_symbol_by_name(struct map *map, const char *name,
-                                       symbol_filter_t filter);
+int map__load(struct map *map);
+struct symbol *map__find_symbol(struct map *map, u64 addr);
+struct symbol *map__find_symbol_by_name(struct map *map, const char *name);
 void map__fixup_start(struct map *map);
 void map__fixup_end(struct map *map);
 
@@ -191,7 +186,7 @@ struct map *maps__find(struct maps *maps, u64 addr);
 struct map *maps__first(struct maps *maps);
 struct map *map__next(struct map *map);
 struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name,
-                                         struct map **mapp, symbol_filter_t filter);
+                                         struct map **mapp);
 void map_groups__init(struct map_groups *mg, struct machine *machine);
 void map_groups__exit(struct map_groups *mg);
 int map_groups__clone(struct thread *thread,
@@ -231,25 +226,22 @@ static inline struct map *map_groups__next(struct map *map)
 
 struct symbol *map_groups__find_symbol(struct map_groups *mg,
                                       enum map_type type, u64 addr,
-                                      struct map **mapp,
-                                      symbol_filter_t filter);
+                                      struct map **mapp);
 
 struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
                                               enum map_type type,
                                               const char *name,
-                                              struct map **mapp,
-                                              symbol_filter_t filter);
+                                              struct map **mapp);
 
 struct addr_map_symbol;
 
-int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter);
+int map_groups__find_ams(struct addr_map_symbol *ams);
 
 static inline
 struct symbol *map_groups__find_function_by_name(struct map_groups *mg,
-                                                const char *name, struct map **mapp,
-                                                symbol_filter_t filter)
+                                                const char *name, struct map **mapp)
 {
-       return map_groups__find_symbol_by_name(mg, MAP__FUNCTION, name, mapp, filter);
+       return map_groups__find_symbol_by_name(mg, MAP__FUNCTION, name, mapp);
 }
 
 int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
index ddb0261b2577a2caf5fb532f7c343bd17cb9c972..2babcdf6283983b6cef131cca0e7bfb7df68cc06 100644 (file)
@@ -445,14 +445,23 @@ static struct cpu_map *pmu_cpumask(const char *name)
        FILE *file;
        struct cpu_map *cpus;
        const char *sysfs = sysfs__mountpoint();
+       const char *templates[] = {
+                "%s/bus/event_source/devices/%s/cpumask",
+                "%s/bus/event_source/devices/%s/cpus",
+                NULL
+       };
+       const char **template;
 
        if (!sysfs)
                return NULL;
 
-       snprintf(path, PATH_MAX,
-                "%s/bus/event_source/devices/%s/cpumask", sysfs, name);
+       for (template = templates; *template; template++) {
+               snprintf(path, PATH_MAX, *template, sysfs, name);
+               if (stat(path, &st) == 0)
+                       break;
+       }
 
-       if (stat(path, &st) < 0)
+       if (!*template)
                return NULL;
 
        file = fopen(path, "r");
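
The pmu_cpumask() change above walks a short list of sysfs file-name templates and
keeps the first path that exists, so PMUs exposing either a "cpumask" or a "cpus"
file are both handled. A self-contained sketch of the same try-in-order pattern; the
base path and template strings below are placeholders rather than a claim about the
real sysfs layout.

#include <stdio.h>
#include <sys/stat.h>

/* Fill buf from each printf-style template in turn and return the first
 * path that exists, or NULL if none of the candidates do. */
static const char *first_existing(char *buf, size_t len,
                                  const char *base, const char *name)
{
        const char *templates[] = {
                "%s/devices/%s/cpumask",
                "%s/devices/%s/cpus",
                NULL
        };
        const char **t;
        struct stat st;

        for (t = templates; *t; t++) {
                snprintf(buf, len, *t, base, name);
                if (stat(buf, &st) == 0)
                        return buf;
        }
        return NULL;
}

int main(void)
{
        char path[4096];
        const char *found = first_existing(path, sizeof(path),
                                           "/sys/bus/event_source", "cpu");

        printf("%s\n", found ? found : "no cpumask/cpus file found");
        return 0;
}
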
index 28733962cd80a63e1376b5b71f4771ab8f8857f7..bc60ce49720ba1fae3ee77adf51e9b33e2caa00f 100644 (file)
@@ -110,13 +110,12 @@ void exit_probe_symbol_maps(void)
 static struct symbol *__find_kernel_function_by_name(const char *name,
                                                     struct map **mapp)
 {
-       return machine__find_kernel_function_by_name(host_machine, name, mapp,
-                                                    NULL);
+       return machine__find_kernel_function_by_name(host_machine, name, mapp);
 }
 
 static struct symbol *__find_kernel_function(u64 addr, struct map **mapp)
 {
-       return machine__find_kernel_function(host_machine, addr, mapp, NULL);
+       return machine__find_kernel_function(host_machine, addr, mapp);
 }
 
 static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void)
@@ -125,7 +124,7 @@ static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void)
        struct kmap *kmap;
        struct map *map = machine__kernel_map(host_machine);
 
-       if (map__load(map, NULL) < 0)
+       if (map__load(map) < 0)
                return NULL;
 
        kmap = map__kmap(map);
@@ -351,9 +350,9 @@ static int kernel_get_module_dso(const char *module, struct dso **pdso)
        vmlinux_name = symbol_conf.vmlinux_name;
        dso->load_errno = 0;
        if (vmlinux_name)
-               ret = dso__load_vmlinux(dso, map, vmlinux_name, false, NULL);
+               ret = dso__load_vmlinux(dso, map, vmlinux_name, false);
        else
-               ret = dso__load_vmlinux_path(dso, map, NULL);
+               ret = dso__load_vmlinux_path(dso, map);
 found:
        *pdso = dso;
        return ret;
@@ -674,6 +673,10 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
        char *tmp;
        int i, skipped = 0;
 
+       /* Skip post-processing if the target is an offline kernel */
+       if (symbol_conf.ignore_vmlinux_buildid)
+               return 0;
+
        reloc_sym = kernel_get_ref_reloc_sym();
        if (!reloc_sym) {
                pr_warning("Relocated base symbol is not found!\n");
@@ -1614,19 +1617,27 @@ out:
        return ret;
 }
 
+/* Returns true if *any* ARG is either a C variable, $params or $vars. */
+bool perf_probe_with_var(struct perf_probe_event *pev)
+{
+       int i = 0;
+
+       for (i = 0; i < pev->nargs; i++)
+               if (is_c_varname(pev->args[i].var)              ||
+                   !strcmp(pev->args[i].var, PROBE_ARG_PARAMS) ||
+                   !strcmp(pev->args[i].var, PROBE_ARG_VARS))
+                       return true;
+       return false;
+}
+
 /* Return true if this perf_probe_event requires debuginfo */
 bool perf_probe_event_need_dwarf(struct perf_probe_event *pev)
 {
-       int i;
-
        if (pev->point.file || pev->point.line || pev->point.lazy_line)
                return true;
 
-       for (i = 0; i < pev->nargs; i++)
-               if (is_c_varname(pev->args[i].var) ||
-                   !strcmp(pev->args[i].var, "$params") ||
-                   !strcmp(pev->args[i].var, "$vars"))
-                       return true;
+       if (perf_probe_with_var(pev))
+               return true;
 
        return false;
 }
@@ -1987,7 +1998,7 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
                map = dso__new_map(tp->module);
                if (!map)
                        goto out;
-               sym = map__find_symbol(map, addr, NULL);
+               sym = map__find_symbol(map, addr);
        } else {
                if (tp->symbol && !addr) {
                        if (kernel_get_symbol_address_by_name(tp->symbol,
@@ -2692,7 +2703,7 @@ static int find_probe_functions(struct map *map, char *name,
        struct symbol *sym;
        struct rb_node *tmp;
 
-       if (map__load(map, NULL) < 0)
+       if (map__load(map) < 0)
                return 0;
 
        map__for_each_symbol(map, sym, tmp) {
@@ -3207,6 +3218,52 @@ int convert_perf_probe_events(struct perf_probe_event *pevs, int npevs)
        return 0;
 }
 
+static int show_probe_trace_event(struct probe_trace_event *tev)
+{
+       char *buf = synthesize_probe_trace_command(tev);
+
+       if (!buf) {
+               pr_debug("Failed to synthesize probe trace event.\n");
+               return -EINVAL;
+       }
+
+       /* Showing the definition always goes to stdout */
+       printf("%s\n", buf);
+       free(buf);
+
+       return 0;
+}
+
+int show_probe_trace_events(struct perf_probe_event *pevs, int npevs)
+{
+       struct strlist *namelist = strlist__new(NULL, NULL);
+       struct probe_trace_event *tev;
+       struct perf_probe_event *pev;
+       int i, j, ret = 0;
+
+       if (!namelist)
+               return -ENOMEM;
+
+       for (j = 0; j < npevs && !ret; j++) {
+               pev = &pevs[j];
+               for (i = 0; i < pev->ntevs && !ret; i++) {
+                       tev = &pev->tevs[i];
+                       /* Skip if the symbol is out of .text or blacklisted */
+                       if (!tev->point.symbol && !pev->uprobes)
+                               continue;
+
+                       /* Set new name for tev (and update namelist) */
+                       ret = probe_trace_event__set_name(tev, pev,
+                                                         namelist, true);
+                       if (!ret)
+                               ret = show_probe_trace_event(tev);
+               }
+       }
+       strlist__delete(namelist);
+
+       return ret;
+}
+
 int apply_perf_probe_events(struct perf_probe_event *pevs, int npevs)
 {
        int i, ret = 0;
@@ -3289,24 +3346,10 @@ out:
        return ret;
 }
 
-/* TODO: don't use a global variable for filter ... */
-static struct strfilter *available_func_filter;
-
-/*
- * If a symbol corresponds to a function with global binding and
- * matches filter return 0. For all others return 1.
- */
-static int filter_available_functions(struct map *map __maybe_unused,
-                                     struct symbol *sym)
-{
-       if (strfilter__compare(available_func_filter, sym->name))
-               return 0;
-       return 1;
-}
-
 int show_available_funcs(const char *target, struct strfilter *_filter,
                                        bool user)
 {
+        struct rb_node *nd;
        struct map *map;
        int ret;
 
@@ -3324,9 +3367,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
                return -EINVAL;
        }
 
-       /* Load symbols with given filter */
-       available_func_filter = _filter;
-       ret = map__load(map, filter_available_functions);
+       ret = map__load(map);
        if (ret) {
                if (ret == -2) {
                        char *str = strfilter__string(_filter);
@@ -3343,7 +3384,14 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
 
        /* Show all (filtered) symbols */
        setup_pager();
-       dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
+
+        for (nd = rb_first(&map->dso->symbol_names[map->type]); nd; nd = rb_next(nd)) {
+               struct symbol_name_rb_node *pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+
+               if (strfilter__compare(_filter, pos->sym.name))
+                       printf("%s\n", pos->sym.name);
+        }
+
 end:
        if (user) {
                map__put(map);
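
With symbol_filter_t gone, loading and filtering are separate steps: map__load() now
keeps every symbol, and show_available_funcs() applies the strfilter only while
printing. A rough standalone illustration of that split, using fnmatch() and a fixed
array in place of the perf rbtree and strfilter; every name below is invented for the
example.

#include <fnmatch.h>
#include <stdio.h>

/* Stand-in for an already-loaded, unfiltered symbol table. */
static const char *symbols[] = {
        "schedule", "do_page_fault", "vfs_read", "kmalloc", NULL
};

/* Filtering happens at display time, against the loaded table. */
static void show_matching(const char *pattern)
{
        const char **s;

        for (s = symbols; *s; s++)
                if (fnmatch(pattern, *s, 0) == 0)
                        printf("%s\n", *s);
}

int main(void)
{
        show_matching("do_*");  /* prints only do_page_fault */
        return 0;
}
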
index f4f45db77c1c1ec59c3ee505f525f2b2561530ee..8091d15113f7a465437e8aa2e7d731583428fedd 100644 (file)
@@ -128,6 +128,8 @@ char *synthesize_perf_probe_point(struct perf_probe_point *pp);
 int perf_probe_event__copy(struct perf_probe_event *dst,
                           struct perf_probe_event *src);
 
+bool perf_probe_with_var(struct perf_probe_event *pev);
+
 /* Check the perf_probe_event needs debuginfo */
 bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
 
@@ -147,6 +149,7 @@ int line_range__init(struct line_range *lr);
 int add_perf_probe_events(struct perf_probe_event *pevs, int npevs);
 int convert_perf_probe_events(struct perf_probe_event *pevs, int npevs);
 int apply_perf_probe_events(struct perf_probe_event *pevs, int npevs);
+int show_probe_trace_events(struct perf_probe_event *pevs, int npevs);
 void cleanup_perf_probe_events(struct perf_probe_event *pevs, int npevs);
 int del_perf_probe_events(struct strfilter *filter);
 
index 9c3b9ed5b3c3ec68ee175b6bec91d1c90211798b..6f931e442f14ab157fed25fdb7ab282932943f8b 100644 (file)
@@ -73,11 +73,10 @@ static void print_both_open_warning(int kerr, int uerr)
 static int open_probe_events(const char *trace_file, bool readwrite)
 {
        char buf[PATH_MAX];
-       const char *tracing_dir = "";
        int ret;
 
-       ret = e_snprintf(buf, PATH_MAX, "%s/%s%s",
-                        tracing_path, tracing_dir, trace_file);
+       ret = e_snprintf(buf, PATH_MAX, "%s/%s",
+                        tracing_path, trace_file);
        if (ret >= 0) {
                pr_debug("Opening %s write=%d\n", buf, readwrite);
                if (readwrite && !probe_event_dry_run)
@@ -877,3 +876,60 @@ int probe_cache__show_all_caches(struct strfilter *filter)
 
        return 0;
 }
+
+static struct {
+       const char *pattern;
+       bool    avail;
+       bool    checked;
+} probe_type_table[] = {
+#define DEFINE_TYPE(idx, pat, def_avail)       \
+       [idx] = {.pattern = pat, .avail = (def_avail)}
+       DEFINE_TYPE(PROBE_TYPE_U, "* u8/16/32/64,*", true),
+       DEFINE_TYPE(PROBE_TYPE_S, "* s8/16/32/64,*", true),
+       DEFINE_TYPE(PROBE_TYPE_X, "* x8/16/32/64,*", false),
+       DEFINE_TYPE(PROBE_TYPE_STRING, "* string,*", true),
+       DEFINE_TYPE(PROBE_TYPE_BITFIELD,
+                   "* b<bit-width>@<bit-offset>/<container-size>", true),
+};
+
+bool probe_type_is_available(enum probe_type type)
+{
+       FILE *fp;
+       char *buf = NULL;
+       size_t len = 0;
+       bool target_line = false;
+       bool ret;
+
+       if (type >= PROBE_TYPE_END)
+               return false;
+       ret = probe_type_table[type].avail;
+       /* No need to check the types that are supported by default */
+       if (ret || probe_type_table[type].checked)
+               return ret;
+
+       if (asprintf(&buf, "%s/README", tracing_path) < 0)
+               return ret;
+
+       fp = fopen(buf, "r");
+       if (!fp)
+               goto end;
+
+       zfree(&buf);
+       while (getline(&buf, &len, fp) > 0 && !ret) {
+               if (!target_line) {
+                       target_line = !!strstr(buf, " type: ");
+                       if (!target_line)
+                               continue;
+               } else if (strstr(buf, "\t          ") != buf)
+                       break;
+               ret = strglobmatch(buf, probe_type_table[type].pattern);
+       }
+       /* Cache the result */
+       probe_type_table[type].checked = true;
+       probe_type_table[type].avail = ret;
+
+       fclose(fp);
+end:
+       free(buf);
+
+       return ret;
+}
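
probe_type_is_available() decides whether the running kernel accepts a given probe
argument type by scanning the ftrace README for the "type:" line and glob-matching
its continuation lines. A rough self-contained sketch of that scan over an in-memory
excerpt; fnmatch() stands in for strglobmatch() and the README text here is invented
for illustration.

#include <fnmatch.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Invented README excerpt; the real file lives under <tracefs>/README. */
static const char *readme[] = {
        "  place: [<module>:]<symbol>[+<offset>]",
        "     fetcharg: %<register>, @<address>, $stack<index>",
        "         type: s8/16/32/64, u8/16/32/64, x8/16/32/64, \"string\" and bitfield",
        "\t          are supported.",
        "  example: p:myprobe do_sys_open dfd=%ax",
        NULL
};

static bool type_pattern_available(const char *pattern)
{
        bool in_type = false;
        const char **line;

        for (line = readme; *line; line++) {
                if (!in_type) {
                        in_type = strstr(*line, " type: ") != NULL;
                        if (!in_type)
                                continue;
                } else if (strstr(*line, "\t          ") != *line) {
                        break;  /* end of the indented continuation block */
                }
                if (fnmatch(pattern, *line, 0) == 0)
                        return true;
        }
        return false;
}

int main(void)
{
        /* Same glob shape as the PROBE_TYPE_X entry in the table above. */
        printf("x type: %s\n",
               type_pattern_available("* x8/16/32/64,*") ? "available" : "missing");
        return 0;
}
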
index 9577b5c0b487ee6adb017555521ad6dcd497a621..eba44c3e9dca9ac670d06736237feb49fb21734b 100644 (file)
@@ -19,6 +19,15 @@ struct probe_cache {
        struct list_head entries;
 };
 
+enum probe_type {
+       PROBE_TYPE_U = 0,
+       PROBE_TYPE_S,
+       PROBE_TYPE_X,
+       PROBE_TYPE_STRING,
+       PROBE_TYPE_BITFIELD,
+       PROBE_TYPE_END,
+};
+
 #define PF_FL_UPROBE   1
 #define PF_FL_RW       2
 #define for_each_probe_cache_entry(entry, pcache) \
@@ -54,6 +63,7 @@ struct probe_cache_entry *probe_cache__find(struct probe_cache *pcache,
 struct probe_cache_entry *probe_cache__find_by_name(struct probe_cache *pcache,
                                        const char *group, const char *event);
 int probe_cache__show_all_caches(struct strfilter *filter);
+bool probe_type_is_available(enum probe_type type);
 #else  /* ! HAVE_LIBELF_SUPPORT */
 static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused)
 {
index 5c290c682afe7176607fe01f4d29742b1821f1a9..8daca4fc1f8d2189cd0450157968901d4ff8e760 100644 (file)
@@ -39,6 +39,7 @@
 #include "util.h"
 #include "symbol.h"
 #include "probe-finder.h"
+#include "probe-file.h"
 
 /* Kprobe tracer basic type is up to u64 */
 #define MAX_BASIC_TYPE_BITS    64
@@ -170,6 +171,7 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs)
  */
 static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
                                     Dwarf_Op *fb_ops, Dwarf_Die *sp_die,
+                                    unsigned int machine,
                                     struct probe_trace_arg *tvar)
 {
        Dwarf_Attribute attr;
@@ -265,7 +267,7 @@ static_var:
        if (!tvar)
                return ret2;
 
-       regs = get_arch_regstr(regn);
+       regs = get_dwarf_regstr(regn, machine);
        if (!regs) {
                /* This should be a bug in DWARF or this tool */
                pr_warning("Mapping for the register number %u "
@@ -297,13 +299,13 @@ static int convert_variable_type(Dwarf_Die *vr_die,
        char sbuf[STRERR_BUFSIZE];
        int bsize, boffs, total;
        int ret;
-       char sign;
+       char prefix;
 
        /* TODO: check all types */
-       if (cast && strcmp(cast, "string") != 0 &&
+       if (cast && strcmp(cast, "string") != 0 && strcmp(cast, "x") != 0 &&
            strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) {
                /* Non string type is OK */
-               /* and respect signedness cast */
+               /* and respect signedness/hexadecimal cast */
                tvar->type = strdup(cast);
                return (tvar->type == NULL) ? -ENOMEM : 0;
        }
@@ -365,11 +367,15 @@ static int convert_variable_type(Dwarf_Die *vr_die,
        }
 
        if (cast && (strcmp(cast, "u") == 0))
-               sign = 'u';
+               prefix = 'u';
        else if (cast && (strcmp(cast, "s") == 0))
-               sign = 's';
+               prefix = 's';
+       else if (cast && (strcmp(cast, "x") == 0) &&
+                probe_type_is_available(PROBE_TYPE_X))
+               prefix = 'x';
        else
-               sign = die_is_signed_type(&type) ? 's' : 'u';
+               prefix = die_is_signed_type(&type) ? 's' :
+                        probe_type_is_available(PROBE_TYPE_X) ? 'x' : 'u';
 
        ret = dwarf_bytesize(&type);
        if (ret <= 0)
@@ -383,7 +389,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
                        dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
                ret = MAX_BASIC_TYPE_BITS;
        }
-       ret = snprintf(buf, 16, "%c%d", sign, ret);
+       ret = snprintf(buf, 16, "%c%d", prefix, ret);
 
 formatted:
        if (ret < 0 || ret >= 16) {
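
Taken together, the hunks above turn the old signed/unsigned choice into a three-way
prefix: an explicit 'u', 's' or 'x' cast wins, and an unprefixed unsigned value falls
back to hexadecimal only when the kernel advertises the x type. A compact sketch of
just that decision; the have_x flag is a stand-in for
probe_type_is_available(PROBE_TYPE_X).

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Pick the one-letter type prefix for a probe argument.
 * cast: user-supplied cast string or NULL, is_signed: from DWARF,
 * have_x: whether the kernel supports the hexadecimal 'x' type. */
static char type_prefix(const char *cast, bool is_signed, bool have_x)
{
        if (cast && strcmp(cast, "u") == 0)
                return 'u';
        if (cast && strcmp(cast, "s") == 0)
                return 's';
        if (cast && strcmp(cast, "x") == 0 && have_x)
                return 'x';
        return is_signed ? 's' : (have_x ? 'x' : 'u');
}

int main(void)
{
        printf("%c64\n", type_prefix(NULL, false, true));       /* x64 */
        printf("%c32\n", type_prefix("s", false, true));        /* s32 */
        printf("%c16\n", type_prefix(NULL, false, false));      /* u16 */
        return 0;
}
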
@@ -538,7 +544,7 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
                 dwarf_diename(vr_die));
 
        ret = convert_variable_location(vr_die, pf->addr, pf->fb_ops,
-                                       &pf->sp_die, pf->tvar);
+                                       &pf->sp_die, pf->machine, pf->tvar);
        if (ret == -ENOENT || ret == -EINVAL) {
                pr_err("Failed to find the location of the '%s' variable at this address.\n"
                       " Perhaps it has been optimized out.\n"
@@ -901,6 +907,38 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
        return die_walk_lines(sp_die, probe_point_lazy_walker, pf);
 }
 
+static void skip_prologue(Dwarf_Die *sp_die, struct probe_finder *pf)
+{
+       struct perf_probe_point *pp = &pf->pev->point;
+
+       /* Not uprobe? */
+       if (!pf->pev->uprobes)
+               return;
+
+       /* Compiled with optimization? */
+       if (die_is_optimized_target(&pf->cu_die))
+               return;
+
+       /* Don't know entrypc? */
+       if (!pf->addr)
+               return;
+
+       /* Only FUNC and FUNC@SRC are eligible. */
+       if (!pp->function || pp->line || pp->retprobe || pp->lazy_line ||
+           pp->offset || pp->abs_address)
+               return;
+
+       /* Not interested in func parameter? */
+       if (!perf_probe_with_var(pf->pev))
+               return;
+
+       pr_info("Target program is compiled without optimization. Skipping prologue.\n"
+               "Probe on address 0x%" PRIx64 " to force probing at the function entry.\n\n",
+               pf->addr);
+
+       die_skip_prologue(sp_die, &pf->cu_die, &pf->addr);
+}
+
 static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 {
        struct probe_finder *pf = data;
@@ -963,6 +1001,7 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
                if (pp->lazy_line)
                        param->retval = find_probe_point_lazy(sp_die, pf);
                else {
+                       skip_prologue(sp_die, pf);
                        pf->addr += pp->offset;
                        /* TODO: Check the address in this function */
                        param->retval = call_probe_finder(sp_die, pf);
@@ -1101,11 +1140,8 @@ static int debuginfo__find_probes(struct debuginfo *dbg,
                                  struct probe_finder *pf)
 {
        int ret = 0;
-
-#if _ELFUTILS_PREREQ(0, 142)
        Elf *elf;
        GElf_Ehdr ehdr;
-       GElf_Shdr shdr;
 
        if (pf->cfi_eh || pf->cfi_dbg)
                return debuginfo__find_probe_location(dbg, pf);
@@ -1118,11 +1154,18 @@ static int debuginfo__find_probes(struct debuginfo *dbg,
        if (gelf_getehdr(elf, &ehdr) == NULL)
                return -EINVAL;
 
-       if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) &&
-           shdr.sh_type == SHT_PROGBITS)
-               pf->cfi_eh = dwarf_getcfi_elf(elf);
+       pf->machine = ehdr.e_machine;
+
+#if _ELFUTILS_PREREQ(0, 142)
+       do {
+               GElf_Shdr shdr;
+
+               if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) &&
+                   shdr.sh_type == SHT_PROGBITS)
+                       pf->cfi_eh = dwarf_getcfi_elf(elf);
 
-       pf->cfi_dbg = dwarf_getcfi(dbg->dbg);
+               pf->cfi_dbg = dwarf_getcfi(dbg->dbg);
+       } while (0);
 #endif
 
        ret = debuginfo__find_probe_location(dbg, pf);
@@ -1150,7 +1193,7 @@ static int copy_variables_cb(Dwarf_Die *die_mem, void *data)
            (tag == DW_TAG_variable && vf->vars)) {
                if (convert_variable_location(die_mem, vf->pf->addr,
                                              vf->pf->fb_ops, &pf->sp_die,
-                                             NULL) == 0) {
+                                             pf->machine, NULL) == 0) {
                        vf->args[vf->nargs].var = (char *)dwarf_diename(die_mem);
                        if (vf->args[vf->nargs].var == NULL) {
                                vf->ret = -ENOMEM;
@@ -1313,7 +1356,7 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
            tag == DW_TAG_variable) {
                ret = convert_variable_location(die_mem, af->pf.addr,
                                                af->pf.fb_ops, &af->pf.sp_die,
-                                               NULL);
+                                               af->pf.machine, NULL);
                if (ret == 0 || ret == -ERANGE) {
                        int ret2;
                        bool externs = !af->child;
index 51137fccb9c81abf96f2fc0559437f552f6254f6..f1d8558f498e96771c13b3f42046a757e888bdf2 100644 (file)
@@ -80,6 +80,7 @@ struct probe_finder {
        Dwarf_CFI               *cfi_dbg;
 #endif
        Dwarf_Op                *fb_ops;        /* Frame base attribute */
+       unsigned int            machine;        /* Target machine arch */
        struct perf_probe_arg   *pvar;          /* Current target variable */
        struct probe_trace_arg  *tvar;          /* Current result variable */
 };
index 5d1eb1ccd96c3e0cf73449c97aaee9246d1f9619..e55a132f69b73e23661bf1a6e21aaac36cbb85ad 100644 (file)
@@ -25,6 +25,7 @@
 #include <ctype.h>
 #include <errno.h>
 #include <linux/bitmap.h>
+#include <linux/time64.h>
 
 #include "../util.h"
 #include <EXTERN.h>
@@ -359,8 +360,8 @@ static void perl_process_tracepoint(struct perf_sample *sample,
        if (!test_and_set_bit(event->id, events_defined))
                define_event_symbols(event, handler, event->print_fmt.args);
 
-       s = nsecs / NSECS_PER_SEC;
-       ns = nsecs - s * NSECS_PER_SEC;
+       s = nsecs / NSEC_PER_SEC;
+       ns = nsecs - s * NSEC_PER_SEC;
 
        scripting_context->event_data = data;
        scripting_context->pevent = evsel->tp_format->pevent;
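
The replacement above is the standard split of a nanosecond timestamp into whole
seconds plus the remainder, now spelled with the shared NSEC_PER_SEC constant from
linux/time64.h instead of a bare 10^9 literal. A minimal worked example with an
arbitrary timestamp:

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL      /* same value linux/time64.h provides */

int main(void)
{
        uint64_t nsecs = 4123456789ULL;         /* arbitrary sample timestamp */
        uint64_t s  = nsecs / NSEC_PER_SEC;     /* 4 */
        uint64_t ns = nsecs - s * NSEC_PER_SEC; /* 123456789 */

        printf("%" PRIu64 ".%09" PRIu64 "\n", s, ns);   /* 4.123456789 */
        return 0;
}
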
index e0203b97947483d82042c3a568f01c73878eb6a9..089438da1f7f76b2318dd2f5b10bc039441e2fc4 100644 (file)
@@ -27,6 +27,7 @@
 #include <stdbool.h>
 #include <errno.h>
 #include <linux/bitmap.h>
+#include <linux/time64.h>
 
 #include "../../perf.h"
 #include "../debug.h"
@@ -426,8 +427,8 @@ static void python_process_tracepoint(struct perf_sample *sample,
                if (!dict)
                        Py_FatalError("couldn't create Python dict");
        }
-       s = nsecs / NSECS_PER_SEC;
-       ns = nsecs - s * NSECS_PER_SEC;
+       s = nsecs / NSEC_PER_SEC;
+       ns = nsecs - s * NSEC_PER_SEC;
 
        scripting_context->event_data = data;
        scripting_context->pevent = evsel->tp_format->pevent;
index 3d3cb8392c86029bb488f737564730e0cd8995bc..1884d7f9b9d2abc516653c479ab3dfb85ecf8a70 100644 (file)
@@ -11,7 +11,7 @@
 regex_t                parent_regex;
 const char     default_parent_pattern[] = "^sys_|^do_page_fault";
 const char     *parent_pattern = default_parent_pattern;
-const char     default_sort_order[] = "comm,dso,symbol";
+const char     *default_sort_order = "comm,dso,symbol";
 const char     default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
 const char     default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked";
 const char     default_top_sort_order[] = "dso,symbol";
@@ -1492,7 +1492,8 @@ void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
 }
 
 static int __sort__hpp_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
-                             struct hists *hists)
+                             struct hists *hists, int line __maybe_unused,
+                             int *span __maybe_unused)
 {
        struct hpp_sort_entry *hse;
        size_t len = fmt->user_len;
@@ -1797,7 +1798,9 @@ static void update_dynamic_len(struct hpp_dynamic_entry *hde,
 }
 
 static int __sort__hde_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
-                             struct hists *hists __maybe_unused)
+                             struct hists *hists __maybe_unused,
+                             int line __maybe_unused,
+                             int *span __maybe_unused)
 {
        struct hpp_dynamic_entry *hde;
        size_t len = fmt->user_len;
index 7ca37ea1739559e1e3f064c9260d5d078cb5b84f..28c0524c87026a213f6f2af95cc71bb6d30f251d 100644 (file)
@@ -28,7 +28,7 @@ extern const char *sort_order;
 extern const char *field_order;
 extern const char default_parent_pattern[];
 extern const char *parent_pattern;
-extern const char default_sort_order[];
+extern const char *default_sort_order;
 extern regex_t ignore_callees_regex;
 extern int have_ignore_callees;
 extern enum sort_mode sort__mode;
index eec6c1149f44758ebe22c37fb07cd4cd6fd0aef7..1cbada2dc6be67fc9afeb0c4298aafe17824ac3f 100644 (file)
@@ -18,6 +18,7 @@
 #include <unistd.h>
 #include <string.h>
 #include <linux/bitmap.h>
+#include <linux/time64.h>
 
 #include "perf.h"
 #include "svghelper.h"
@@ -274,14 +275,14 @@ static char *time_to_string(u64 duration)
 
        text[0] = 0;
 
-       if (duration < 1000) /* less than 1 usec */
+       if (duration < NSEC_PER_USEC) /* less than 1 usec */
                return text;
 
-       if (duration < 1000 * 1000) { /* less than 1 msec */
-               sprintf(text, "%.1f us", duration / 1000.0);
+       if (duration < NSEC_PER_MSEC) { /* less than 1 msec */
+               sprintf(text, "%.1f us", duration / (double)NSEC_PER_USEC);
                return text;
        }
-       sprintf(text, "%.1f ms", duration / 1000.0 / 1000);
+       sprintf(text, "%.1f ms", duration / (double)NSEC_PER_MSEC);
 
        return text;
 }
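
time_to_string() now compares the nanosecond duration against the named
NSEC_PER_USEC/NSEC_PER_MSEC constants rather than 1000-based literals. A standalone
sketch of the same thresholds and formatting:

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define NSEC_PER_MSEC 1000000ULL        /* values mirror linux/time64.h */

/* Nothing below 1 usec, "N.N us" below 1 msec, otherwise "N.N ms". */
static void print_duration(uint64_t ns)
{
        if (ns < NSEC_PER_USEC)
                printf("\n");
        else if (ns < NSEC_PER_MSEC)
                printf("%.1f us\n", ns / (double)NSEC_PER_USEC);
        else
                printf("%.1f ms\n", ns / (double)NSEC_PER_MSEC);
}

int main(void)
{
        print_duration(850);            /* below 1 usec: blank line */
        print_duration(42 * 1000);      /* 42.0 us */
        print_duration(12500000);       /* 12.5 ms */
        return 0;
}
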
@@ -297,7 +298,7 @@ void svg_waiting(int Yslot, int cpu, u64 start, u64 end, const char *backtrace)
 
        style = "waiting";
 
-       if (end-start > 10 * 1000000) /* 10 msec */
+       if (end-start > 10 * NSEC_PER_MSEC) /* 10 msec */
                style = "WAITING";
 
        text = time_to_string(end-start);
index a811c13a74d663ac40efdf03333145450e7d7c19..99400b0e8f2a892dcfbfe2572758ef3eae73be59 100644 (file)
@@ -206,6 +206,37 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
        return NULL;
 }
 
+static bool want_demangle(bool is_kernel_sym)
+{
+       return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
+}
+
+static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
+{
+       int demangle_flags = verbose ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS;
+       char *demangled = NULL;
+
+       /*
+        * We need to figure out if the object was created from C++ sources.
+        * DWARF DW_compile_unit has this, but we don't always have access
+        * to it...
+        */
+       if (!want_demangle(dso->kernel || kmodule))
+               return demangled;
+
+       demangled = bfd_demangle(NULL, elf_name, demangle_flags);
+       if (demangled == NULL)
+               demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
+       else if (rust_is_mangled(demangled))
+               /*
+                   * Input to Rust demangling is the BFD-demangled
+                * Input to Rust demangling is the BFD-demangled
+                * name which it Rust-demangles in place.
+                */
+
+       return demangled;
+}
+
 #define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
        for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
             idx < nr_entries; \
@@ -223,8 +254,7 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
  * And always look at the original dso, not at debuginfo packages, that
  * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
  */
-int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *map,
-                               symbol_filter_t filter)
+int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *map)
 {
        uint32_t nr_rel_entries, idx;
        GElf_Sym sym;
@@ -301,45 +331,53 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 
                elf_section__for_each_rela(reldata, pos, pos_mem, idx,
                                           nr_rel_entries) {
+                       const char *elf_name = NULL;
+                       char *demangled = NULL;
                        symidx = GELF_R_SYM(pos->r_info);
                        plt_offset += shdr_plt.sh_entsize;
                        gelf_getsym(syms, symidx, &sym);
+
+                       elf_name = elf_sym__name(&sym, symstrs);
+                       demangled = demangle_sym(dso, 0, elf_name);
+                       if (demangled != NULL)
+                               elf_name = demangled;
                        snprintf(sympltname, sizeof(sympltname),
-                                "%s@plt", elf_sym__name(&sym, symstrs));
+                                "%s@plt", elf_name);
+                       free(demangled);
 
                        f = symbol__new(plt_offset, shdr_plt.sh_entsize,
                                        STB_GLOBAL, sympltname);
                        if (!f)
                                goto out_elf_end;
 
-                       if (filter && filter(map, f))
-                               symbol__delete(f);
-                       else {
-                               symbols__insert(&dso->symbols[map->type], f);
-                               ++nr;
-                       }
+                       symbols__insert(&dso->symbols[map->type], f);
+                       ++nr;
                }
        } else if (shdr_rel_plt.sh_type == SHT_REL) {
                GElf_Rel pos_mem, *pos;
                elf_section__for_each_rel(reldata, pos, pos_mem, idx,
                                          nr_rel_entries) {
+                       const char *elf_name = NULL;
+                       char *demangled = NULL;
                        symidx = GELF_R_SYM(pos->r_info);
                        plt_offset += shdr_plt.sh_entsize;
                        gelf_getsym(syms, symidx, &sym);
+
+                       elf_name = elf_sym__name(&sym, symstrs);
+                       demangled = demangle_sym(dso, 0, elf_name);
+                       if (demangled != NULL)
+                               elf_name = demangled;
                        snprintf(sympltname, sizeof(sympltname),
-                                "%s@plt", elf_sym__name(&sym, symstrs));
+                                "%s@plt", elf_name);
+                       free(demangled);
 
                        f = symbol__new(plt_offset, shdr_plt.sh_entsize,
                                        STB_GLOBAL, sympltname);
                        if (!f)
                                goto out_elf_end;
 
-                       if (filter && filter(map, f))
-                               symbol__delete(f);
-                       else {
-                               symbols__insert(&dso->symbols[map->type], f);
-                               ++nr;
-                       }
+                       symbols__insert(&dso->symbols[map->type], f);
+                       ++nr;
                }
        }
 
@@ -685,7 +723,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
        }
 
        /* Always reject images with a mismatched build-id: */
-       if (dso->has_build_id) {
+       if (dso->has_build_id && !symbol_conf.ignore_vmlinux_buildid) {
                u8 build_id[BUILD_ID_SIZE];
 
                if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) < 0) {
@@ -775,17 +813,11 @@ static u64 ref_reloc(struct kmap *kmap)
        return 0;
 }
 
-static bool want_demangle(bool is_kernel_sym)
-{
-       return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
-}
-
 void __weak arch__sym_update(struct symbol *s __maybe_unused,
                GElf_Sym *sym __maybe_unused) { }
 
-int dso__load_sym(struct dso *dso, struct map *map,
-                 struct symsrc *syms_ss, struct symsrc *runtime_ss,
-                 symbol_filter_t filter, int kmodule)
+int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
+                 struct symsrc *runtime_ss, int kmodule)
 {
        struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL;
        struct map_groups *kmaps = kmap ? map__kmaps(map) : NULL;
@@ -1070,29 +1102,10 @@ int dso__load_sym(struct dso *dso, struct map *map,
                        sym.st_value -= shdr.sh_addr - shdr.sh_offset;
                }
 new_symbol:
-               /*
-                * We need to figure out if the object was created from C++ sources
-                * DWARF DW_compile_unit has this, but we don't always have access
-                * to it...
-                */
-               if (want_demangle(dso->kernel || kmodule)) {
-                       int demangle_flags = DMGL_NO_OPTS;
-                       if (verbose)
-                               demangle_flags = DMGL_PARAMS | DMGL_ANSI;
-
-                       demangled = bfd_demangle(NULL, elf_name, demangle_flags);
-                       if (demangled == NULL)
-                               demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
-                       else if (rust_is_mangled(demangled))
-                               /*
-                                * Input to Rust demangling is the BFD-demangled
-                                * name which it Rust-demangles in place.
-                                */
-                               rust_demangle_sym(demangled);
+               demangled = demangle_sym(dso, kmodule, elf_name);
+               if (demangled != NULL)
+                       elf_name = demangled;
 
-                       if (demangled != NULL)
-                               elf_name = demangled;
-               }
                f = symbol__new(sym.st_value, sym.st_size,
                                GELF_ST_BIND(sym.st_info), elf_name);
                free(demangled);
@@ -1101,21 +1114,16 @@ new_symbol:
 
                arch__sym_update(f, &sym);
 
-               if (filter && filter(curr_map, f))
-                       symbol__delete(f);
-               else {
-                       symbols__insert(&curr_dso->symbols[curr_map->type], f);
-                       nr++;
-               }
+               __symbols__insert(&curr_dso->symbols[curr_map->type], f, dso->kernel);
+               nr++;
        }
 
        /*
         * For misannotated, zeroed, ASM function sizes.
         */
        if (nr > 0) {
-               if (!symbol_conf.allow_aliases)
-                       symbols__fixup_duplicate(&dso->symbols[map->type]);
                symbols__fixup_end(&dso->symbols[map->type]);
+               symbols__fixup_duplicate(&dso->symbols[map->type]);
                if (kmap) {
                        /*
                         * We need to fixup this here too because we create new
index 48906333a858c06b41991f33cb0f5f2cdd4f68d9..11cdde9805455b52563af827a9d9d352bd2ede35 100644 (file)
@@ -287,8 +287,7 @@ void symsrc__destroy(struct symsrc *ss)
 
 int dso__synthesize_plt_symbols(struct dso *dso __maybe_unused,
                                struct symsrc *ss __maybe_unused,
-                               struct map *map __maybe_unused,
-                               symbol_filter_t filter __maybe_unused)
+                               struct map *map __maybe_unused)
 {
        return 0;
 }
@@ -334,7 +333,6 @@ enum dso_type dso__type_fd(int fd)
 int dso__load_sym(struct dso *dso, struct map *map __maybe_unused,
                  struct symsrc *ss,
                  struct symsrc *runtime_ss __maybe_unused,
-                 symbol_filter_t filter __maybe_unused,
                  int kmodule __maybe_unused)
 {
        unsigned char build_id[BUILD_ID_SIZE];
index 37e8d20ae03e29ef2a9fbc48e048858a4f820785..19c9c558454f671684938147b1061635b3f188f0 100644 (file)
@@ -9,6 +9,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <inttypes.h>
+#include "annotate.h"
 #include "build-id.h"
 #include "util.h"
 #include "debug.h"
 #include <symbol/kallsyms.h>
 #include <sys/utsname.h>
 
-static int dso__load_kernel_sym(struct dso *dso, struct map *map,
-                               symbol_filter_t filter);
-static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
-                       symbol_filter_t filter);
+static int dso__load_kernel_sym(struct dso *dso, struct map *map);
+static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map);
+static bool symbol__is_idle(const char *name);
+
 int vmlinux_path__nr_entries;
 char **vmlinux_path;
 
@@ -152,6 +153,9 @@ void symbols__fixup_duplicate(struct rb_root *symbols)
        struct rb_node *nd;
        struct symbol *curr, *next;
 
+       if (symbol_conf.allow_aliases)
+               return;
+
        nd = rb_first(symbols);
 
        while (nd) {
@@ -235,8 +239,13 @@ struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
        if (sym == NULL)
                return NULL;
 
-       if (symbol_conf.priv_size)
+       if (symbol_conf.priv_size) {
+               if (symbol_conf.init_annotation) {
+                       struct annotation *notes = (void *)sym;
+                       pthread_mutex_init(&notes->lock, NULL);
+               }
                sym = ((void *)sym) + symbol_conf.priv_size;
+       }
 
        sym->start   = start;
        sym->end     = len ? start + len : start;
@@ -268,13 +277,24 @@ void symbols__delete(struct rb_root *symbols)
        }
 }
 
-void symbols__insert(struct rb_root *symbols, struct symbol *sym)
+void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel)
 {
        struct rb_node **p = &symbols->rb_node;
        struct rb_node *parent = NULL;
        const u64 ip = sym->start;
        struct symbol *s;
 
+       if (kernel) {
+               const char *name = sym->name;
+               /*
+                * ppc64 uses function descriptors and prepends a '.' to
+                * every instruction address. Remove it.
+                */
+               if (name[0] == '.')
+                       name++;
+               sym->idle = symbol__is_idle(name);
+       }
+
        while (*p != NULL) {
                parent = *p;
                s = rb_entry(parent, struct symbol, rb_node);
@@ -287,6 +307,11 @@ void symbols__insert(struct rb_root *symbols, struct symbol *sym)
        rb_insert_color(&sym->rb_node, symbols);
 }
 
+void symbols__insert(struct rb_root *symbols, struct symbol *sym)
+{
+       __symbols__insert(symbols, sym, false);
+}
+
 static struct symbol *symbols__find(struct rb_root *symbols, u64 ip)
 {
        struct rb_node *n;
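
__symbols__insert() now deals with two kernel-symbol quirks at insertion time: ppc64
function descriptors prefix text symbols with a '.', which is stripped before the
name is checked against the idle-function list, and matching symbols are flagged so
reporting tools can skip them. A small standalone sketch of that normalization; the
table below is abbreviated from the one in symbol__is_idle().

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Abbreviated version of the idle-symbol table in symbol__is_idle(). */
static const char * const idle_symbols[] = {
        "cpu_idle", "pseries_dedicated_idle_sleep", NULL
};

static bool is_idle_symbol(const char *name)
{
        int i;

        /* ppc64 function descriptors prepend '.' to the entry name. */
        if (name[0] == '.')
                name++;

        for (i = 0; idle_symbols[i]; i++)
                if (!strcmp(idle_symbols[i], name))
                        return true;
        return false;
}

int main(void)
{
        printf("%d\n", is_idle_symbol(".cpu_idle"));    /* 1: dot stripped */
        printf("%d\n", is_idle_symbol("schedule"));     /* 0 */
        return 0;
}
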
@@ -415,7 +440,7 @@ void dso__reset_find_symbol_cache(struct dso *dso)
 
 void dso__insert_symbol(struct dso *dso, enum map_type type, struct symbol *sym)
 {
-       symbols__insert(&dso->symbols[type], sym);
+       __symbols__insert(&dso->symbols[type], sym, dso->kernel);
 
        /* update the symbol cache if necessary */
        if (dso->last_find_result[type].addr >= sym->start &&
@@ -537,7 +562,7 @@ struct process_kallsyms_args {
  * These are symbols in the kernel image, so make sure that
  * sym is from a kernel DSO.
  */
-bool symbol__is_idle(struct symbol *sym)
+static bool symbol__is_idle(const char *name)
 {
        const char * const idle_symbols[] = {
                "cpu_idle",
@@ -554,14 +579,10 @@ bool symbol__is_idle(struct symbol *sym)
                "pseries_dedicated_idle_sleep",
                NULL
        };
-
        int i;
 
-       if (!sym)
-               return false;
-
        for (i = 0; idle_symbols[i]; i++) {
-               if (!strcmp(idle_symbols[i], sym->name))
+               if (!strcmp(idle_symbols[i], name))
                        return true;
        }
 
@@ -590,7 +611,7 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
         * We will pass the symbols to the filter later, in
         * map__split_kallsyms, when we have split the maps per module
         */
-       symbols__insert(root, sym);
+       __symbols__insert(root, sym, !strchr(name, '['));
 
        return 0;
 }
@@ -607,8 +628,7 @@ static int dso__load_all_kallsyms(struct dso *dso, const char *filename,
        return kallsyms__parse(filename, &args, map__process_kallsym_symbol);
 }
 
-static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
-                                        symbol_filter_t filter)
+static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map)
 {
        struct map_groups *kmaps = map__kmaps(map);
        struct map *curr_map;
@@ -637,7 +657,7 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
 
                curr_map = map_groups__find(kmaps, map->type, pos->start);
 
-               if (!curr_map || (filter && filter(curr_map, pos))) {
+               if (!curr_map) {
                        symbol__delete(pos);
                        continue;
                }
@@ -660,8 +680,7 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
  * kernel range is broken in several maps, named [kernel].N, as we don't have
  * the original ELF section names vmlinux have.
  */
-static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
-                              symbol_filter_t filter)
+static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta)
 {
        struct map_groups *kmaps = map__kmaps(map);
        struct machine *machine;
@@ -738,7 +757,7 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
 
                        if (count == 0) {
                                curr_map = map;
-                               goto filter_symbol;
+                               goto add_symbol;
                        }
 
                        if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
@@ -770,18 +789,18 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
                        pos->start -= delta;
                        pos->end -= delta;
                }
-filter_symbol:
-               if (filter && filter(curr_map, pos)) {
-discard_symbol:                rb_erase(&pos->rb_node, root);
-                       symbol__delete(pos);
-               } else {
-                       if (curr_map != map) {
-                               rb_erase(&pos->rb_node, root);
-                               symbols__insert(&curr_map->dso->symbols[curr_map->type], pos);
-                               ++moved;
-                       } else
-                               ++count;
-               }
+add_symbol:
+               if (curr_map != map) {
+                       rb_erase(&pos->rb_node, root);
+                       symbols__insert(&curr_map->dso->symbols[curr_map->type], pos);
+                       ++moved;
+               } else
+                       ++count;
+
+               continue;
+discard_symbol:
+               rb_erase(&pos->rb_node, root);
+               symbol__delete(pos);
        }
 
        if (curr_map != map &&
@@ -1221,7 +1240,7 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta)
 }
 
 int __dso__load_kallsyms(struct dso *dso, const char *filename,
-                        struct map *map, bool no_kcore, symbol_filter_t filter)
+                        struct map *map, bool no_kcore)
 {
        u64 delta = 0;
 
@@ -1234,8 +1253,8 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename,
        if (kallsyms__delta(map, filename, &delta))
                return -1;
 
-       symbols__fixup_duplicate(&dso->symbols[map->type]);
        symbols__fixup_end(&dso->symbols[map->type]);
+       symbols__fixup_duplicate(&dso->symbols[map->type]);
 
        if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
                dso->symtab_type = DSO_BINARY_TYPE__GUEST_KALLSYMS;
@@ -1243,19 +1262,18 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename,
                dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS;
 
        if (!no_kcore && !dso__load_kcore(dso, map, filename))
-               return dso__split_kallsyms_for_kcore(dso, map, filter);
+               return dso__split_kallsyms_for_kcore(dso, map);
        else
-               return dso__split_kallsyms(dso, map, delta, filter);
+               return dso__split_kallsyms(dso, map, delta);
 }
 
 int dso__load_kallsyms(struct dso *dso, const char *filename,
-                      struct map *map, symbol_filter_t filter)
+                      struct map *map)
 {
-       return __dso__load_kallsyms(dso, filename, map, false, filter);
+       return __dso__load_kallsyms(dso, filename, map, false);
 }
 
-static int dso__load_perf_map(struct dso *dso, struct map *map,
-                             symbol_filter_t filter)
+static int dso__load_perf_map(struct dso *dso, struct map *map)
 {
        char *line = NULL;
        size_t n;
@@ -1297,12 +1315,8 @@ static int dso__load_perf_map(struct dso *dso, struct map *map,
                if (sym == NULL)
                        goto out_delete_line;
 
-               if (filter && filter(map, sym))
-                       symbol__delete(sym);
-               else {
-                       symbols__insert(&dso->symbols[map->type], sym);
-                       nr_syms++;
-               }
+               symbols__insert(&dso->symbols[map->type], sym);
+               nr_syms++;
        }
 
        free(line);
@@ -1358,7 +1372,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
        }
 }
 
-int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
+int dso__load(struct dso *dso, struct map *map)
 {
        char *name;
        int ret = -1;
@@ -1381,9 +1395,9 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 
        if (dso->kernel) {
                if (dso->kernel == DSO_TYPE_KERNEL)
-                       ret = dso__load_kernel_sym(dso, map, filter);
+                       ret = dso__load_kernel_sym(dso, map);
                else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
-                       ret = dso__load_guest_kernel_sym(dso, map, filter);
+                       ret = dso__load_guest_kernel_sym(dso, map);
 
                goto out;
        }
@@ -1407,7 +1421,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                        goto out;
                }
 
-               ret = dso__load_perf_map(dso, map, filter);
+               ret = dso__load_perf_map(dso, map);
                dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT :
                                             DSO_BINARY_TYPE__NOT_FOUND;
                goto out;
@@ -1498,14 +1512,14 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                        kmod = true;
 
        if (syms_ss)
-               ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, kmod);
+               ret = dso__load_sym(dso, map, syms_ss, runtime_ss, kmod);
        else
                ret = -1;
 
        if (ret > 0) {
                int nr_plt;
 
-               nr_plt = dso__synthesize_plt_symbols(dso, runtime_ss, map, filter);
+               nr_plt = dso__synthesize_plt_symbols(dso, runtime_ss, map);
                if (nr_plt > 0)
                        ret += nr_plt;
        }
@@ -1544,8 +1558,7 @@ out_unlock:
 }
 
 int dso__load_vmlinux(struct dso *dso, struct map *map,
-                     const char *vmlinux, bool vmlinux_allocated,
-                     symbol_filter_t filter)
+                     const char *vmlinux, bool vmlinux_allocated)
 {
        int err = -1;
        struct symsrc ss;
@@ -1565,7 +1578,7 @@ int dso__load_vmlinux(struct dso *dso, struct map *map,
        if (symsrc__init(&ss, dso, symfs_vmlinux, symtab_type))
                return -1;
 
-       err = dso__load_sym(dso, map, &ss, &ss, filter, 0);
+       err = dso__load_sym(dso, map, &ss, &ss, 0);
        symsrc__destroy(&ss);
 
        if (err > 0) {
@@ -1581,8 +1594,7 @@ int dso__load_vmlinux(struct dso *dso, struct map *map,
        return err;
 }
 
-int dso__load_vmlinux_path(struct dso *dso, struct map *map,
-                          symbol_filter_t filter)
+int dso__load_vmlinux_path(struct dso *dso, struct map *map)
 {
        int i, err = 0;
        char *filename = NULL;
@@ -1591,7 +1603,7 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map,
                 vmlinux_path__nr_entries + 1);
 
        for (i = 0; i < vmlinux_path__nr_entries; ++i) {
-               err = dso__load_vmlinux(dso, map, vmlinux_path[i], false, filter);
+               err = dso__load_vmlinux(dso, map, vmlinux_path[i], false);
                if (err > 0)
                        goto out;
        }
@@ -1599,7 +1611,7 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map,
        if (!symbol_conf.ignore_vmlinux_buildid)
                filename = dso__build_id_filename(dso, NULL, 0);
        if (filename != NULL) {
-               err = dso__load_vmlinux(dso, map, filename, true, filter);
+               err = dso__load_vmlinux(dso, map, filename, true);
                if (err > 0)
                        goto out;
                free(filename);
@@ -1713,8 +1725,7 @@ proc_kallsyms:
        return strdup(path);
 }
 
-static int dso__load_kernel_sym(struct dso *dso, struct map *map,
-                               symbol_filter_t filter)
+static int dso__load_kernel_sym(struct dso *dso, struct map *map)
 {
        int err;
        const char *kallsyms_filename = NULL;
@@ -1740,12 +1751,11 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map,
        }
 
        if (!symbol_conf.ignore_vmlinux && symbol_conf.vmlinux_name != NULL) {
-               return dso__load_vmlinux(dso, map, symbol_conf.vmlinux_name,
-                                        false, filter);
+               return dso__load_vmlinux(dso, map, symbol_conf.vmlinux_name, false);
        }
 
        if (!symbol_conf.ignore_vmlinux && vmlinux_path != NULL) {
-               err = dso__load_vmlinux_path(dso, map, filter);
+               err = dso__load_vmlinux_path(dso, map);
                if (err > 0)
                        return err;
        }
@@ -1761,7 +1771,7 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map,
        kallsyms_filename = kallsyms_allocated_filename;
 
 do_kallsyms:
-       err = dso__load_kallsyms(dso, kallsyms_filename, map, filter);
+       err = dso__load_kallsyms(dso, kallsyms_filename, map);
        if (err > 0)
                pr_debug("Using %s for symbols\n", kallsyms_filename);
        free(kallsyms_allocated_filename);
@@ -1776,8 +1786,7 @@ do_kallsyms:
        return err;
 }
 
-static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
-                                     symbol_filter_t filter)
+static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map)
 {
        int err;
        const char *kallsyms_filename = NULL;
@@ -1799,7 +1808,7 @@ static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
                if (symbol_conf.default_guest_vmlinux_name != NULL) {
                        err = dso__load_vmlinux(dso, map,
                                                symbol_conf.default_guest_vmlinux_name,
-                                               false, filter);
+                                               false);
                        return err;
                }
 
@@ -1811,7 +1820,7 @@ static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map,
                kallsyms_filename = path;
        }
 
-       err = dso__load_kallsyms(dso, kallsyms_filename, map, filter);
+       err = dso__load_kallsyms(dso, kallsyms_filename, map);
        if (err > 0)
                pr_debug("Using %s for symbols\n", kallsyms_filename);
        if (err > 0 && !dso__is_kcore(dso)) {
@@ -1948,6 +1957,23 @@ static bool symbol__read_kptr_restrict(void)
        return value;
 }
 
+int symbol__annotation_init(void)
+{
+       if (symbol_conf.initialized) {
+               pr_err("Annotation needs to be initialized before symbol__init()\n");
+               return -1;
+       }
+
+       if (symbol_conf.init_annotation) {
+               pr_warning("Annotation being initialized multiple times\n");
+               return 0;
+       }
+
+       symbol_conf.priv_size += sizeof(struct annotation);
+       symbol_conf.init_annotation = true;
+       return 0;
+}
+
 int symbol__init(struct perf_env *env)
 {
        const char *symfs;
index 699f7cbcfe720b3ac9ed4fbf0e110db1136aae35..0dacfb7d5b67b0b39bd481342e09bed3600ef7d9 100644 (file)
@@ -57,7 +57,7 @@ struct symbol {
        u64             end;
        u16             namelen;
        u8              binding;
-       bool            ignore;
+       u8              idle:1;
        u8              arch_sym;
        char            name[0];
 };
@@ -88,6 +88,7 @@ struct symbol_conf {
        unsigned short  priv_size;
        unsigned short  nr_events;
        bool            try_vmlinux_path,
+                       init_annotation,
                        force,
                        ignore_vmlinux,
                        ignore_vmlinux_buildid,
@@ -240,16 +241,13 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 bool symsrc__has_symtab(struct symsrc *ss);
 bool symsrc__possibly_runtime(struct symsrc *ss);
 
-int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter);
+int dso__load(struct dso *dso, struct map *map);
 int dso__load_vmlinux(struct dso *dso, struct map *map,
-                     const char *vmlinux, bool vmlinux_allocated,
-                     symbol_filter_t filter);
-int dso__load_vmlinux_path(struct dso *dso, struct map *map,
-                          symbol_filter_t filter);
+                     const char *vmlinux, bool vmlinux_allocated);
+int dso__load_vmlinux_path(struct dso *dso, struct map *map);
 int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map,
-                        bool no_kcore, symbol_filter_t filter);
-int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map,
-                      symbol_filter_t filter);
+                        bool no_kcore);
+int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map);
 
 void dso__insert_symbol(struct dso *dso, enum map_type type,
                        struct symbol *sym);
@@ -277,6 +275,8 @@ struct perf_env;
 int symbol__init(struct perf_env *env);
 void symbol__exit(void);
 void symbol__elf_init(void);
+int symbol__annotation_init(void);
+
 struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
 size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
                                      const struct addr_location *al,
@@ -291,16 +291,15 @@ size_t symbol__fprintf(struct symbol *sym, FILE *fp);
 bool symbol_type__is_a(char symbol_type, enum map_type map_type);
 bool symbol__restricted_filename(const char *filename,
                                 const char *restricted_filename);
-bool symbol__is_idle(struct symbol *sym);
 int symbol__config_symfs(const struct option *opt __maybe_unused,
                         const char *dir, int unset __maybe_unused);
 
 int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
-                 struct symsrc *runtime_ss, symbol_filter_t filter,
-                 int kmodule);
+                 struct symsrc *runtime_ss, int kmodule);
 int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
-                               struct map *map, symbol_filter_t filter);
+                               struct map *map);
 
+void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel);
 void symbols__insert(struct rb_root *symbols, struct symbol *sym);
 void symbols__fixup_duplicate(struct rb_root *symbols);
 void symbols__fixup_end(struct rb_root *symbols);
index cee559d8c9e8e14ebedbbb5c8de081c00af7f63d..85c56800f17a6018eb080db40e9a08b03f9a460c 100644 (file)
@@ -15,6 +15,7 @@
 #include <byteswap.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
+#include <linux/time64.h>
 #include <unistd.h>
 #include "callchain.h"
 #include "strlist.h"
index e5f55477491d6c60f02053f9b19ae1ba57eb5b80..43899e0d6fa148d80ebc08643d1fa9cf73ef93e9 100644 (file)
@@ -179,10 +179,6 @@ static inline void *zalloc(size_t size)
 #undef tolower
 #undef toupper
 
-#ifndef NSEC_PER_MSEC
-#define NSEC_PER_MSEC  1000000L
-#endif
-
 int parse_nsec_time(const char *str, u64 *ptime);
 
 extern unsigned char sane_ctype[256];
index 60a40459f2698d1b0f3f91b14c16b846eb05fe34..7cf412103205b1f49b29350fdd93d4f496307e16 100644 (file)
@@ -7,19 +7,8 @@
 #define CPU_DOWN_PREPARE       0x0005 /* CPU (unsigned)v going down */
 #define CPU_DOWN_FAILED                0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD               0x0007 /* CPU (unsigned)v dead */
-#define CPU_DYING              0x0008 /* CPU (unsigned)v not running any task,
-                                       * not handling interrupts, soon dead.
-                                       * Called on the dying cpu, interrupts
-                                       * are already disabled. Must not
-                                       * sleep, must not fail */
 #define CPU_POST_DEAD          0x0009 /* CPU (unsigned)v dead, cpu_hotplug
                                        * lock is dropped */
-#define CPU_STARTING           0x000A /* CPU (unsigned)v soon running.
-                                       * Called on the new cpu, just before
-                                       * enabling interrupts. Must not sleep,
-                                       * must not fail */
-#define CPU_DYING_IDLE         0x000B /* CPU (unsigned)v dying, reached
-                                       * idle loop. */
 #define CPU_BROKEN             0x000C /* CPU (unsigned)v did not die properly,
                                        * perhaps due to preemption. */
 #define CPU_TASKS_FROZEN       0x0010
@@ -30,5 +19,3 @@
 #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
 #define CPU_DOWN_FAILED_FROZEN  (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN                (CPU_DEAD | CPU_TASKS_FROZEN)
-#define CPU_DYING_FROZEN       (CPU_DYING | CPU_TASKS_FROZEN)
-#define CPU_STARTING_FROZEN    (CPU_STARTING | CPU_TASKS_FROZEN)
index 4f747ee07f10a93447642ecf9d52e7d5e93f524b..a89f80a5b711683c6ec4c64acb5729cc8a1c9715 100644 (file)
@@ -5,7 +5,8 @@ include ../lib.mk
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
-                       check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test
+                       check_initial_reg_state sigreturn ldt_gdt iopl \
+                       protection_keys
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
                        test_FCMOV test_FCOMI test_FISTTP \
                        vdso_restorer
diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
new file mode 100644 (file)
index 0000000..b202939
--- /dev/null
@@ -0,0 +1,219 @@
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#define NR_PKEYS 16
+#define PKRU_BITS_PER_PKEY 2
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
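+/*
+ * printf() is not async-signal-safe, so when dprint_in_signal is set we
+ * format into a static buffer and emit it with write(), which is safe to
+ * call from a signal handler.
+ */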
+static inline void sigsafe_printf(const char *format, ...)
+{
+       va_list ap;
+
+       va_start(ap, format);
+       if (!dprint_in_signal) {
+               vprintf(format, ap);
+       } else {
+               int len = vsnprintf(dprint_in_signal_buffer,
+                                   DPRINT_IN_SIGNAL_BUF_SIZE,
+                                   format, ap);
+               /*
+                * len is the amount that would have been printed, but the
+                * actual write is truncated at DPRINT_IN_SIGNAL_BUF_SIZE.
+                */
+               if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
+                       len = DPRINT_IN_SIGNAL_BUF_SIZE;
+               write(1, dprint_in_signal_buffer, len);
+       }
+       va_end(ap);
+}
+#define dprintf_level(level, args...) do {     \
+       if (level <= DEBUG_LEVEL)               \
+               sigsafe_printf(args);           \
+       fflush(NULL);                           \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern unsigned int shadow_pkru;
+static inline unsigned int __rdpkru(void)
+{
+       unsigned int eax, edx;
+       unsigned int ecx = 0;
+       unsigned int pkru;
+
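+       /* .byte 0x0f,0x01,0xee encodes the RDPKRU instruction */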
+       asm volatile(".byte 0x0f,0x01,0xee\n\t"
+                    : "=a" (eax), "=d" (edx)
+                    : "c" (ecx));
+       pkru = eax;
+       return pkru;
+}
+
+static inline unsigned int _rdpkru(int line)
+{
+       unsigned int pkru = __rdpkru();
+
+       dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
+                       line, pkru, shadow_pkru);
+       assert(pkru == shadow_pkru);
+
+       return pkru;
+}
+
+#define rdpkru() _rdpkru(__LINE__)
+
+static inline void __wrpkru(unsigned int pkru)
+{
+       unsigned int eax = pkru;
+       unsigned int ecx = 0;
+       unsigned int edx = 0;
+
+       dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
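+       /* .byte 0x0f,0x01,0xef encodes the WRPKRU instruction */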
+       asm volatile(".byte 0x0f,0x01,0xef\n\t"
+                    : : "a" (eax), "c" (ecx), "d" (edx));
+       assert(pkru == __rdpkru());
+}
+
+static inline void wrpkru(unsigned int pkru)
+{
+       dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+       /* will do the shadow check for us: */
+       rdpkru();
+       __wrpkru(pkru);
+       shadow_pkru = pkru;
+       dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+}
+
+/*
+ * These are technically racy, since something could
+ * change PKRU between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+       unsigned int pkru = rdpkru();
+       int bit = pkey * 2;
+
+       if (do_allow)
+               pkru &= ~(1<<bit);
+       else
+               pkru |= (1<<bit);
+
+       wrpkru(pkru);
+       dprintf4("pkru now: %08x\n", rdpkru());
+}
+
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+       long pkru = rdpkru();
+       int bit = pkey * 2 + 1;
+
+       if (do_allow_write)
+               pkru &= ~(1<<bit);
+       else
+               pkru |= (1<<bit);
+
+       wrpkru(pkru);
+       dprintf4("pkru now: %08x\n", rdpkru());
+}
+
+#define PROT_PKEY0     0x10            /* protection key value (bit 0) */
+#define PROT_PKEY1     0x20            /* protection key value (bit 1) */
+#define PROT_PKEY2     0x40            /* protection key value (bit 2) */
+#define PROT_PKEY3     0x80            /* protection key value (bit 3) */
+
+#define PAGE_SIZE 4096
+#define MB     (1<<20)
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+               unsigned int *ecx, unsigned int *edx)
+{
+       /* ecx is often an input as well as an output. */
+       asm volatile(
+               "cpuid;"
+               : "=a" (*eax),
+                 "=b" (*ebx),
+                 "=c" (*ecx),
+                 "=d" (*edx)
+               : "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pku(void)
+{
+       unsigned int eax;
+       unsigned int ebx;
+       unsigned int ecx;
+       unsigned int edx;
+
+       eax = 0x7;
+       ecx = 0x0;
+       __cpuid(&eax, &ebx, &ecx, &edx);
+
+       if (!(ecx & X86_FEATURE_PKU)) {
+               dprintf2("cpu does not have PKU\n");
+               return 0;
+       }
+       if (!(ecx & X86_FEATURE_OSPKE)) {
+               dprintf2("cpu does not have OSPKE\n");
+               return 0;
+       }
+       return 1;
+}
+
+#define XSTATE_PKRU_BIT        (9)
+#define XSTATE_PKRU    0x200
+
+int pkru_xstate_offset(void)
+{
+       unsigned int eax;
+       unsigned int ebx;
+       unsigned int ecx;
+       unsigned int edx;
+       int xstate_offset;
+       int xstate_size;
+       unsigned long XSTATE_CPUID = 0xd;
+       int leaf;
+
+       /* assume that XSTATE_PKRU is set in XCR0 */
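+       /*
+        * CPUID leaf 0xd, sub-leaf 9 (the PKRU state component) reports the
+        * component's size in EAX and its offset into the XSAVE area in EBX.
+        */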
+       leaf = XSTATE_PKRU_BIT;
+       {
+               eax = XSTATE_CPUID;
+               ecx = leaf;
+               __cpuid(&eax, &ebx, &ecx, &edx);
+
+               if (leaf == XSTATE_PKRU_BIT) {
+                       xstate_offset = ebx;
+                       xstate_size = eax;
+               }
+       }
+
+       if (xstate_size == 0) {
+               printf("could not find size/offset of PKRU in xsave state\n");
+               return 0;
+       }
+
+       return xstate_offset;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
new file mode 100644 (file)
index 0000000..bdd58c7
--- /dev/null
@@ -0,0 +1,1410 @@
+/*
+ * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt)
+ *
+ * There are examples in here of:
+ *  * how to set protection keys on memory
+ *  * how to set/clear bits in PKRU (the rights register)
+ *  * how to handle SEGV_PKRU signals and extract pkey-relevant
+ *    information from the siginfo
+ *
+ * Things to add:
+ *     make sure KSM and KSM COW breaking works
+ *     prefault pages in at malloc, or not
+ *     protect MPX bounds tables with protection keys?
+ *     make sure VMA splitting/merging is working correctly
+ *     OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ *     look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ *     do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ *     gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ *     gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+unsigned int shadow_pkru;
+
+#define HPAGE_SIZE     (1UL<<21)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)  ((typeof(p))ALIGN_UP((unsigned long)(p),        ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to)        ((typeof(p))ALIGN_DOWN((unsigned long)(p),      ptr_align_to))
+#define __stringify_1(x...)     #x
+#define __stringify(x...)       __stringify_1(x)
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do {            \
+       if (!(condition)) {                     \
+               dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+                               __FILE__, __LINE__,     \
+                               test_nr, iteration_nr); \
+               dprintf0("errno at assert: %d", errno); \
+               abort_hooks();                  \
+               assert(condition);              \
+       }                                       \
+} while (0)
+#define raw_assert(cond) assert(cond)
+
+void cat_into_file(char *str, char *file)
+{
+       int fd = open(file, O_RDWR);
+       int ret;
+
+       dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+       /*
+        * these need to be raw because they are called under
+        * pkey_assert()
+        */
+       raw_assert(fd >= 0);
+       ret = write(fd, str, strlen(str));
+       if (ret != strlen(str)) {
+               perror("write to file failed");
+               fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+               raw_assert(0);
+       }
+       close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+int tracing_root_ok(void)
+{
+       if (geteuid() != 0) {
+               if (!warned_tracing)
+                       fprintf(stderr, "WARNING: not run as root, "
+                                       "can not do tracing control\n");
+               warned_tracing = 1;
+               return 0;
+       }
+       return 1;
+}
+#endif
+
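+/*
+ * When built with CONTROL_TRACING, these helpers drive the ftrace knobs
+ * under /sys/kernel/debug/tracing to capture a function_graph trace of
+ * just this test's pid.
+ */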
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+       char pidstr[32];
+
+       if (!tracing_root_ok())
+               return;
+
+       sprintf(pidstr, "%d", getpid());
+       cat_into_file("0", TRACEDIR "/tracing_on");
+       cat_into_file("\n", TRACEDIR "/trace");
+       if (1) {
+               cat_into_file("function_graph", TRACEDIR "/current_tracer");
+               cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+       } else {
+               cat_into_file("nop", TRACEDIR "/current_tracer");
+       }
+       cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+       cat_into_file("1", TRACEDIR "/tracing_on");
+       dprintf1("enabled tracing\n");
+#endif
+}
+
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+       if (!tracing_root_ok())
+               return;
+       cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+void abort_hooks(void)
+{
+       fprintf(stderr, "running %s()...\n", __func__);
+       tracing_off();
+#ifdef SLEEP_ON_ABORT
+       sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+static inline void __page_o_noops(void)
+{
+       /* 8 bytes of instruction * 512 repetitions = 1 page */
+       asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions.  That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+__attribute__((__aligned__(PAGE_SIZE)))
+void lots_o_noops_around_write(int *write_to_me)
+{
+       dprintf3("running %s()\n", __func__);
+       __page_o_noops();
+       /* Assume this happens in the second page of instructions: */
+       *write_to_me = __LINE__;
+       /* pad out by another page: */
+       __page_o_noops();
+       dprintf3("%s() done\n", __func__);
+}
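+
+/*
+ * A usage sketch, assuming the caller marks this function's text as
+ * execute-only with an access-disabled pkey: madvise() away the middle
+ * page, then call the function so the instruction fetch faults it back
+ * in, exercising the kernel's execute-only fault handling.
+ */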
+
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__
+#define SYS_mprotect_key 380
+#define SYS_pkey_alloc  381
+#define SYS_pkey_free   382
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x18
+#else
+#define SYS_mprotect_key 329
+#define SYS_pkey_alloc  330
+#define SYS_pkey_free   331
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+#endif
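+
+/*
+ * The syscall numbers are hard-coded here in case the test is built
+ * against headers that predate them; si_pkey_offset is the byte offset
+ * of the new si_pkey field within siginfo_t for each ABI.
+ */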
+
+void dump_mem(void *dumpme, int len_bytes)
+{
+       char *c = (void *)dumpme;
+       int i;
+
+       for (i = 0; i < len_bytes; i += sizeof(u64)) {
+               u64 *ptr = (u64 *)(c + i);
+               dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
+       }
+}
+
+#define __SI_FAULT      (3 << 16)
+#define SEGV_BNDERR     (__SI_FAULT|3)  /* failed address bound checks */
+#define SEGV_PKUERR     (__SI_FAULT|4)
+
+static char *si_code_str(int si_code)
+{
+       if (si_code & SEGV_MAPERR)
+               return "SEGV_MAPERR";
+       if (si_code & SEGV_ACCERR)
+               return "SEGV_ACCERR";
+       if (si_code & SEGV_BNDERR)
+               return "SEGV_BNDERR";
+       if (si_code & SEGV_PKUERR)
+               return "SEGV_PKUERR";
+       return "UNKNOWN";
+}
+
+int pkru_faults;
+int last_si_pkey = -1;
+void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+       ucontext_t *uctxt = vucontext;
+       int trapno;
+       unsigned long ip;
+       char *fpregs;
+       u32 *pkru_ptr;
+       u64 si_pkey;
+       u32 *si_pkey_ptr;
+       int pkru_offset;
+       fpregset_t fpregset;
+
+       dprint_in_signal = 1;
+       dprintf1(">>>>===============SIGSEGV============================\n");
+       dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
+                       __rdpkru(), shadow_pkru);
+
+       trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+       ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+       fpregset = uctxt->uc_mcontext.fpregs;
+       fpregs = (void *)fpregset;
+
+       dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
+                       trapno, ip, si_code_str(si->si_code), si->si_code);
+#ifdef __i386__
+       /*
+        * 32-bit has some extra padding so that userspace can tell whether
+        * the XSTATE header is present in addition to the "legacy" FPU
+        * state.  We just assume that it is here.
+        */
+       fpregs += 0x70;
+#endif
+       pkru_offset = pkru_xstate_offset();
+       pkru_ptr = (void *)(&fpregs[pkru_offset]);
+
+       dprintf1("siginfo: %p\n", si);
+       dprintf1(" fpregs: %p\n", fpregs);
+       /*
+        * If we got a PKRU fault, we *HAVE* to have at least one bit set in
+        * here.
+        */
+       dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
+       if (DEBUG_LEVEL > 4)
+               dump_mem(pkru_ptr - 128, 256);
+       pkey_assert(*pkru_ptr);
+
+       si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
+       dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+       dump_mem(si_pkey_ptr - 8, 24);
+       si_pkey = *si_pkey_ptr;
+       pkey_assert(si_pkey < NR_PKEYS);
+       last_si_pkey = si_pkey;
+
+       if ((si->si_code == SEGV_MAPERR) ||
+           (si->si_code == SEGV_ACCERR) ||
+           (si->si_code == SEGV_BNDERR)) {
+               printf("non-PK si_code, exiting...\n");
+               exit(4);
+       }
+
+       dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
+       /* need __rdpkru() version so we do not do shadow_pkru checking */
+       dprintf1("signal pkru from  pkru: %08x\n", __rdpkru());
+       dprintf1("si_pkey from siginfo: %jx\n", si_pkey);
+       *(u64 *)pkru_ptr = 0x00000000;
+       dprintf1("WARNING: set PKRU=0 to allow faulting instruction to continue\n");
+       pkru_faults++;
+       dprintf1("<<<<==================================================\n");
+       return;
+       if (trapno == 14) {
+               fprintf(stderr,
+                       "ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
+                       trapno, ip);
+               fprintf(stderr, "si_addr %p\n", si->si_addr);
+               fprintf(stderr, "REG_ERR: %lx\n",
+                               (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
+               exit(1);
+       } else {
+               fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
+               fprintf(stderr, "si_addr %p\n", si->si_addr);
+               fprintf(stderr, "REG_ERR: %lx\n",
+                               (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
+               exit(2);
+       }
+       dprint_in_signal = 0;
+}
+
+int wait_all_children(void)
+{
+       int status;
+       return waitpid(-1, &status, 0);
+}
+
+void sig_chld(int x)
+{
+       dprint_in_signal = 1;
+       dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+       dprint_in_signal = 0;
+}
+
+void setup_sigsegv_handler(void)
+{
+       int r, rs;
+       struct sigaction newact;
+       struct sigaction oldact;
+
+       /* #PF is mapped to sigsegv */
+       int signum  = SIGSEGV;
+
+       newact.sa_handler = 0;
+       newact.sa_sigaction = signal_handler;
+
+       /* sigset_t - signals to block while in the handler */
+       /* get the old signal mask. */
+       rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+       pkey_assert(rs == 0);
+
+       /* call sa_sigaction, not sa_handler*/
+       newact.sa_flags = SA_SIGINFO;
+
+       newact.sa_restorer = 0;  /* void(*)(), obsolete */
+       r = sigaction(signum, &newact, &oldact);
+       r = sigaction(SIGALRM, &newact, &oldact);
+       pkey_assert(r == 0);
+}
+
+void setup_handlers(void)
+{
+       signal(SIGCHLD, &sig_chld);
+       setup_sigsegv_handler();
+}
+
+pid_t fork_lazy_child(void)
+{
+       pid_t forkret;
+
+       forkret = fork();
+       pkey_assert(forkret >= 0);
+       dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+       if (!forkret) {
+               /* in the child */
+               while (1) {
+                       dprintf1("child sleeping...\n");
+                       sleep(30);
+               }
+       }
+       return forkret;
+}
+
+void davecmp(void *_a, void *_b, int len)
+{
+       int i;
+       unsigned long *a = _a;
+       unsigned long *b = _b;
+
+       for (i = 0; i < len / sizeof(*a); i++) {
+               if (a[i] == b[i])
+                       continue;
+
+               dprintf3("[%3d]: a: %016lx b: %016lx\n", i, a[i], b[i]);
+       }
+}
+
+void dumpit(char *f)
+{
+       int fd = open(f, O_RDONLY);
+       char buf[100];
+       int nr_read;
+
+       dprintf2("maps fd: %d\n", fd);
+       do {
+               nr_read = read(fd, &buf[0], sizeof(buf));
+               write(1, buf, nr_read);
+       } while (nr_read > 0);
+       close(fd);
+}
+
+#define PKEY_DISABLE_ACCESS    0x1
+#define PKEY_DISABLE_WRITE     0x2
+
+u32 pkey_get(int pkey, unsigned long flags)
+{
+       u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+       u32 pkru = __rdpkru();
+       u32 shifted_pkru;
+       u32 masked_pkru;
+
+       dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+                       __func__, pkey, flags, 0, 0);
+       dprintf2("%s() raw pkru: %x\n", __func__, pkru);
+
+       shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
+       dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
+       masked_pkru = shifted_pkru & mask;
+       dprintf2("%s() masked  pkru: %x\n", __func__, masked_pkru);
+       /*
+        * shift down the relevant bits to the lowest two, then
+        * mask off all the other high bits.
+        */
+       return masked_pkru;
+}
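+
+/*
+ * Worked example: with PKRU = 0x0c and pkey = 1, shifted_pkru is 0x3, so
+ * pkey_get() reports both PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE for
+ * that key.
+ */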
+
+int pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+       u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+       u32 old_pkru = __rdpkru();
+       u32 new_pkru;
+
+       /* make sure that 'rights' only contains the bits we expect: */
+       assert(!(rights & ~mask));
+
+       /* copy old pkru */
+       new_pkru = old_pkru;
+       /* mask out bits from pkey in old value: */
+       new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
+       /* OR in new bits for pkey: */
+       new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
+
+       __wrpkru(new_pkru);
+
+       dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
+                       __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
+       return 0;
+}
+
+void pkey_disable_set(int pkey, int flags)
+{
+       unsigned long syscall_flags = 0;
+       int ret;
+       int pkey_rights;
+       u32 orig_pkru = rdpkru();
+
+       dprintf1("START->%s(%d, 0x%x)\n", __func__,
+               pkey, flags);
+       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+       pkey_rights = pkey_get(pkey, syscall_flags);
+
+       dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+       pkey_assert(pkey_rights >= 0);
+
+       pkey_rights |= flags;
+
+       ret = pkey_set(pkey, pkey_rights, syscall_flags);
+       assert(!ret);
+       /* pkru and flags have the same format */
+       shadow_pkru |= flags << (pkey * 2);
+       dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
+
+       pkey_assert(ret >= 0);
+
+       pkey_rights = pkey_get(pkey, syscall_flags);
+       dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+
+       dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
+       if (flags)
+               pkey_assert(rdpkru() > orig_pkru);
+       dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+               pkey, flags);
+}
+
+void pkey_disable_clear(int pkey, int flags)
+{
+       unsigned long syscall_flags = 0;
+       int ret;
+       int pkey_rights = pkey_get(pkey, syscall_flags);
+       u32 orig_pkru = rdpkru();
+
+       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+       dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+       pkey_assert(pkey_rights >= 0);
+
+       pkey_rights &= ~flags;
+
+       ret = pkey_set(pkey, pkey_rights, 0);
+       /* pkru and flags have the same format */
+       shadow_pkru &= ~(flags << (pkey * 2));
+       pkey_assert(ret >= 0);
+
+       pkey_rights = pkey_get(pkey, syscall_flags);
+       dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+
+       dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
+       if (flags)
+               assert(rdpkru() < orig_pkru);
+}
+
+void pkey_write_allow(int pkey)
+{
+       pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_write_deny(int pkey)
+{
+       pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_access_allow(int pkey)
+{
+       pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+void pkey_access_deny(int pkey)
+{
+       pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+               unsigned long pkey)
+{
+       int sret;
+
+       dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+                       ptr, size, orig_prot, pkey);
+
+       errno = 0;
+       sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
+       if (errno) {
+               dprintf2("SYS_mprotect_key sret: %d\n", sret);
+               dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+               dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+               if (DEBUG_LEVEL >= 2)
+                       perror("SYS_mprotect_pkey");
+       }
+       return sret;
+}
+
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+       int ret = syscall(SYS_pkey_alloc, flags, init_val);
+       dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+                       __func__, flags, init_val, ret, errno);
+       return ret;
+}
+
+int alloc_pkey(void)
+{
+       int ret;
+       unsigned long init_val = 0x0;
+
+       dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
+                       __LINE__, __rdpkru(), shadow_pkru);
+       ret = sys_pkey_alloc(0, init_val);
+       /*
+        * pkey_alloc() sets PKRU, so we need to reflect it in
+        * shadow_pkru:
+        */
+       dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       if (ret) {
+               /* clear both the bits: */
+               shadow_pkru &= ~(0x3      << (ret * 2));
+               dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+                               __LINE__, ret, __rdpkru(), shadow_pkru);
+               /*
+                * move the new state in from init_val
+                * (remember, we cheated and init_val == pkru format)
+                */
+               shadow_pkru |=  (init_val << (ret * 2));
+       }
+       dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
+       /* for shadow checking: */
+       rdpkru();
+       dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       return ret;
+}
+
+int sys_pkey_free(unsigned long pkey)
+{
+       int ret = syscall(SYS_pkey_free, pkey);
+       dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+       return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared.  This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+int alloc_random_pkey(void)
+{
+       int max_nr_pkey_allocs;
+       int ret;
+       int i;
+       int alloced_pkeys[NR_PKEYS];
+       int nr_alloced = 0;
+       int random_index;
+       memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+       /* allocate every possible key and make a note of which ones we got */
+       max_nr_pkey_allocs = NR_PKEYS;
+       max_nr_pkey_allocs = 1;
+       for (i = 0; i < max_nr_pkey_allocs; i++) {
+               int new_pkey = alloc_pkey();
+               if (new_pkey < 0)
+                       break;
+               alloced_pkeys[nr_alloced++] = new_pkey;
+       }
+
+       pkey_assert(nr_alloced > 0);
+       /* select a random one out of the allocated ones */
+       random_index = rand() % nr_alloced;
+       ret = alloced_pkeys[random_index];
+       /* now zero it out so we don't free it next */
+       alloced_pkeys[random_index] = 0;
+
+       /* go through the allocated ones that we did not want and free them */
+       for (i = 0; i < nr_alloced; i++) {
+               int free_ret;
+               if (!alloced_pkeys[i])
+                       continue;
+               free_ret = sys_pkey_free(alloced_pkeys[i]);
+               pkey_assert(!free_ret);
+       }
+       dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       return ret;
+}
+
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+               unsigned long pkey)
+{
+       int nr_iterations = random() % 100;
+       int ret;
+
+       while (0) {
+               int rpkey = alloc_random_pkey();
+               ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+               dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+                               ptr, size, orig_prot, pkey, ret);
+               if (nr_iterations-- < 0)
+                       break;
+
+               dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+               sys_pkey_free(rpkey);
+               dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       }
+       pkey_assert(pkey < NR_PKEYS);
+
+       ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+       dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+                       ptr, size, orig_prot, pkey, ret);
+       pkey_assert(!ret);
+       dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+                       __LINE__, ret, __rdpkru(), shadow_pkru);
+       return ret;
+}
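+
+/*
+ * A sketch of how the helpers above combine in the tests further down
+ * (names as defined later in this file):
+ *
+ *     int pkey = alloc_random_pkey();
+ *     int *ptr = malloc_pkey_with_mprotect(PAGE_SIZE,
+ *                                          PROT_READ|PROT_WRITE, pkey);
+ *     pkey_write_deny(pkey);
+ *     *ptr = 1;               // expected to fault
+ *     expected_pk_fault(pkey);
+ */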
+
+struct pkey_malloc_record {
+       void *ptr;
+       long size;
+};
+struct pkey_malloc_record *pkey_malloc_records;
+long nr_pkey_malloc_records;
+void record_pkey_malloc(void *ptr, long size)
+{
+       long i;
+       struct pkey_malloc_record *rec = NULL;
+
+       for (i = 0; i < nr_pkey_malloc_records; i++) {
+               rec = &pkey_malloc_records[i];
+               /* find a free record (one whose ptr has been cleared) */
+               if (rec->ptr == NULL)
+                       break;
+               rec = NULL;
+       }
+       if (!rec) {
+               /* every record is full */
+               size_t old_nr_records = nr_pkey_malloc_records;
+               size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+               size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+               dprintf2("new_nr_records: %zd\n", new_nr_records);
+               dprintf2("new_size: %zd\n", new_size);
+               pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+               pkey_assert(pkey_malloc_records != NULL);
+               rec = &pkey_malloc_records[nr_pkey_malloc_records];
+               /*
+                * realloc() does not initialize memory, so zero it from
+                * the first new record all the way to the end.
+                */
+               for (i = 0; i < new_nr_records - old_nr_records; i++)
+                       memset(rec + i, 0, sizeof(*rec));
+       }
+       dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+               (int)(rec - pkey_malloc_records), rec, ptr, size);
+       rec->ptr = ptr;
+       rec->size = size;
+       nr_pkey_malloc_records++;
+}
+
+void free_pkey_malloc(void *ptr)
+{
+       long i;
+       int ret;
+       dprintf3("%s(%p)\n", __func__, ptr);
+       for (i = 0; i < nr_pkey_malloc_records; i++) {
+               struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+               dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+                               ptr, i, rec, rec->ptr, rec->size);
+               if ((ptr <  rec->ptr) ||
+                   (ptr >= rec->ptr + rec->size))
+                       continue;
+
+               dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+                               ptr, i, rec, rec->ptr, rec->size);
+               nr_pkey_malloc_records--;
+               ret = munmap(rec->ptr, rec->size);
+               dprintf3("munmap ret: %d\n", ret);
+               pkey_assert(!ret);
+               dprintf3("clearing rec->ptr, rec: %p\n", rec);
+               rec->ptr = NULL;
+               dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+               return;
+       }
+       pkey_assert(false);
+}
+
+
+void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int ret;
+
+       rdpkru();
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       pkey_assert(pkey < NR_PKEYS);
+       ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+       pkey_assert(!ret);
+       record_pkey_malloc(ptr, size);
+       rdpkru();
+
+       dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+       return ptr;
+}
+
+void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+       int ret;
+       void *ptr;
+
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       /*
+        * Guarantee we can fit at least one huge page in the resulting
+        * allocation by allocating space for 2:
+        */
+       size = ALIGN_UP(size, HPAGE_SIZE * 2);
+       ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       record_pkey_malloc(ptr, size);
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       dprintf1("unaligned ptr: %p\n", ptr);
+       ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+       dprintf1("  aligned ptr: %p\n", ptr);
+       ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+       dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+       ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+       dprintf1("MADV_WILLNEED ret: %d\n", ret);
+       memset(ptr, 0, HPAGE_SIZE);
+
+       dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+       return ptr;
+}
+
+int hugetlb_setup_ok;
+#define GET_NR_HUGE_PAGES 10
+void setup_hugetlbfs(void)
+{
+       int err;
+       int fd;
+       int validated_nr_pages;
+       int i;
+       char buf[] = "123";
+
+       if (geteuid() != 0) {
+               fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+               return;
+       }
+
+       cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+       /*
+        * Now go make sure that we got the pages and that they
+        * are 2M pages.  Someone might have made 1G the default.
+        */
+       fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
+       if (fd < 0) {
+               perror("opening sysfs 2M hugetlb config");
+               return;
+       }
+
+       /* -1 to guarantee leaving the trailing \0 */
+       err = read(fd, buf, sizeof(buf)-1);
+       close(fd);
+       if (err <= 0) {
+               perror("reading sysfs 2M hugetlb config");
+               return;
+       }
+
+       if (atoi(buf) != GET_NR_HUGE_PAGES) {
+               fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
+                       buf, GET_NR_HUGE_PAGES);
+               return;
+       }
+
+       hugetlb_setup_ok = 1;
+}
+
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+       if (!hugetlb_setup_ok)
+               return PTR_ERR_ENOTSUP;
+
+       dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+       size = ALIGN_UP(size, HPAGE_SIZE * 2);
+       pkey_assert(pkey < NR_PKEYS);
+       ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       record_pkey_malloc(ptr, size);
+
+       dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+       return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int fd;
+
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       pkey_assert(pkey < NR_PKEYS);
+       fd = open("/dax/foo", O_RDWR);
+       pkey_assert(fd >= 0);
+
+       ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+       pkey_assert(ptr != (void *)-1);
+
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       record_pkey_malloc(ptr, size);
+
+       dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+       close(fd);
+       return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+       malloc_pkey_with_mprotect,
+       malloc_pkey_anon_huge,
+       malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+       malloc_pkey_mmap_direct,
+       malloc_pkey_mmap_dax,
+*/
+};
+
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+       void *ret;
+       static int malloc_type;
+       int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+       pkey_assert(pkey < NR_PKEYS);
+
+       while (1) {
+               pkey_assert(malloc_type < nr_malloc_types);
+
+               ret = pkey_malloc[malloc_type](size, prot, pkey);
+               pkey_assert(ret != (void *)-1);
+
+               malloc_type++;
+               if (malloc_type >= nr_malloc_types)
+                       malloc_type = (random()%nr_malloc_types);
+
+               /* try again if the malloc_type we tried is unsupported */
+               if (ret == PTR_ERR_ENOTSUP)
+                       continue;
+
+               break;
+       }
+
+       dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+                       size, prot, pkey, ret);
+       return ret;
+}
+
+int last_pkru_faults;
+void expected_pk_fault(int pkey)
+{
+       dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
+                       __func__, last_pkru_faults, pkru_faults);
+       dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+       pkey_assert(last_pkru_faults + 1 == pkru_faults);
+       pkey_assert(last_si_pkey == pkey);
+       /*
+        * The signal handler should have cleared out PKRU to let the
+        * test program continue.  We now have to restore it.
+        */
+       if (__rdpkru() != 0)
+               pkey_assert(0);
+
+       __wrpkru(shadow_pkru);
+       dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
+                       __func__, shadow_pkru);
+       last_pkru_faults = pkru_faults;
+       last_si_pkey = -1;
+}
+
+void do_not_expect_pk_fault(void)
+{
+       pkey_assert(last_pkru_faults == pkru_faults);
+}
+
+int test_fds[10] = { -1 };
+int nr_test_fds;
+void __save_test_fd(int fd)
+{
+       pkey_assert(fd >= 0);
+       pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+       test_fds[nr_test_fds] = fd;
+       nr_test_fds++;
+}
+
+int get_test_read_fd(void)
+{
+       int test_fd = open("/etc/passwd", O_RDONLY);
+       __save_test_fd(test_fd);
+       return test_fd;
+}
+
+void close_test_fds(void)
+{
+       int i;
+
+       for (i = 0; i < nr_test_fds; i++) {
+               if (test_fds[i] < 0)
+                       continue;
+               close(test_fds[i]);
+               test_fds[i] = -1;
+       }
+       nr_test_fds = 0;
+}
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+__attribute__((noinline)) int read_ptr(int *ptr)
+{
+       /*
+        * Keep GCC from optimizing this away somehow
+        */
+       barrier();
+       return *ptr;
+}
+
+void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ptr_contents;
+
+       dprintf1("disabling write access to PKEY[1], doing read\n");
+       pkey_write_deny(pkey);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("*ptr: %d\n", ptr_contents);
+       dprintf1("\n");
+}
+void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int ptr_contents;
+
+       dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+       rdpkru();
+       pkey_access_deny(pkey);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("*ptr: %d\n", ptr_contents);
+       expected_pk_fault(pkey);
+}
+void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+       pkey_write_deny(pkey);
+       *ptr = __LINE__;
+       expected_pk_fault(pkey);
+}
+void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+       pkey_access_deny(pkey);
+       *ptr = __LINE__;
+       expected_pk_fault(pkey);
+}
+void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int ret;
+       int test_fd = get_test_read_fd();
+
+       dprintf1("disabling access to PKEY[%02d], "
+                "having kernel read() to buffer\n", pkey);
+       pkey_access_deny(pkey);
+       ret = read(test_fd, ptr, 1);
+       dprintf1("read ret: %d\n", ret);
+       pkey_assert(ret);
+}
+void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ret;
+       int test_fd = get_test_read_fd();
+
+       pkey_write_deny(pkey);
+       ret = read(test_fd, ptr, 100);
+       dprintf1("read ret: %d\n", ret);
+       if (ret < 0 && (DEBUG_LEVEL > 0))
+               perror("verbose read result (OK for this to be bad)");
+       pkey_assert(ret);
+}
+
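+/*
+ * vmsplice() makes the kernel take references on the user pages (via
+ * get_user_pages()), so this checks that the kernel's GUP path also
+ * honors an access-disabled pkey and fails instead of reading the buffer.
+ */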
+void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int pipe_ret, vmsplice_ret;
+       struct iovec iov;
+       int pipe_fds[2];
+
+       pipe_ret = pipe(pipe_fds);
+
+       pkey_assert(pipe_ret == 0);
+       dprintf1("disabling access to PKEY[%02d], "
+                "having kernel vmsplice from buffer\n", pkey);
+       pkey_access_deny(pkey);
+       iov.iov_base = ptr;
+       iov.iov_len = PAGE_SIZE;
+       vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+       dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+       pkey_assert(vmsplice_ret == -1);
+
+       close(pipe_fds[0]);
+       close(pipe_fds[1]);
+}
+
+void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ignored = 0xdada;
+       int futex_ret;
+       int some_int = __LINE__;
+
+       dprintf1("disabling write to PKEY[%02d], "
+                "doing futex gunk in buffer\n", pkey);
+       *ptr = some_int;
+       pkey_write_deny(pkey);
+       futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+                       &ignored, ignored);
+       if (DEBUG_LEVEL > 0)
+               perror("futex");
+       dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+       int err;
+       int i;
+
+       /* Note: 0 is the default pkey, so don't mess with it */
+       for (i = 1; i < NR_PKEYS; i++) {
+               if (pkey == i)
+                       continue;
+
+               dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+               err = sys_pkey_free(i);
+               pkey_assert(err);
+
+               /* not enforced when pkey_get() is not a syscall
+               err = pkey_get(i, 0);
+               pkey_assert(err < 0);
+               */
+
+               err = sys_pkey_free(i);
+               pkey_assert(err);
+
+               err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+               pkey_assert(err);
+       }
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+       int err;
+       int bad_flag = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + 1;
+       int bad_pkey = NR_PKEYS+99;
+
+       /* not enforced when pkey_get() is not a syscall
+       err = pkey_get(bad_pkey, bad_flag);
+       pkey_assert(err < 0);
+       */
+
+       /* pass a known-invalid pkey in: */
+       err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+       pkey_assert(err);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+       unsigned long flags;
+       unsigned long init_val;
+       int err;
+       int allocated_pkeys[NR_PKEYS] = {0};
+       int nr_allocated_pkeys = 0;
+       int i;
+
+       for (i = 0; i < NR_PKEYS*2; i++) {
+               int new_pkey;
+               dprintf1("%s() alloc loop: %d\n", __func__, i);
+               new_pkey = alloc_pkey();
+               dprintf4("%s()::%d, new_pkey: %d pkru: 0x%x shadow: 0x%x\n",
+                               __func__, __LINE__, new_pkey, __rdpkru(), shadow_pkru);
+               rdpkru(); /* for shadow checking */
+               dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+               if ((new_pkey == -1) && (errno == ENOSPC)) {
+                       dprintf2("%s() failed to allocate pkey after %d tries\n",
+                               __func__, nr_allocated_pkeys);
+                       break;
+               }
+               pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+               allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+       }
+
+       dprintf3("%s()::%d\n", __func__, __LINE__);
+
+       /*
+        * ensure it did not reach the end of the loop without
+        * failure:
+        */
+       pkey_assert(i < NR_PKEYS*2);
+
+       /*
+        * There are 16 pkeys supported in hardware.  One is taken
+        * up for the default (0) and another can be taken up by
+        * an execute-only mapping.  Ensure that we can allocate
+        * at least 14 (16-2).
+        */
+       pkey_assert(i >= NR_PKEYS-2);
+
+       for (i = 0; i < nr_allocated_pkeys; i++) {
+               err = sys_pkey_free(allocated_pkeys[i]);
+               pkey_assert(!err);
+               rdpkru(); /* for shadow checking */
+       }
+}
+
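+/*
+ * Deny ourselves access to a pkey-protected page and check that
+ * PTRACE_PEEKDATA on a forked child can still read it while a direct
+ * read from this task faults.  A plain, unprotected control buffer
+ * must be readable both ways.
+ */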
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+       __attribute__((__unused__)) int peek_result;
+       pid_t child_pid;
+       void *ignored = 0;
+       long ret;
+       int status;
+       /*
+        * This is the "control" for our little experiment.  Make sure
+        * we can always access it when ptracing.
+        */
+       int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+       int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+       /*
+        * Fork a child which is an exact copy of this process, of course.
+        * That means we can do all of our tests via ptrace() and then plain
+        * memory access and ensure they work differently.
+        */
+       child_pid = fork_lazy_child();
+       dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+       ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+       if (ret)
+               perror("attach");
+       dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+       pkey_assert(ret != -1);
+       ret = waitpid(child_pid, &status, WUNTRACED);
+       if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+               fprintf(stderr, "weird waitpid result %ld stat %x\n",
+                               ret, status);
+               pkey_assert(0);
+       }
+       dprintf2("waitpid ret: %ld\n", ret);
+       dprintf2("waitpid status: %d\n", status);
+
+       pkey_access_deny(pkey);
+       pkey_write_deny(pkey);
+
+       /* Write access, untested for now:
+       ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+       pkey_assert(ret != -1);
+       dprintf1("poke at %p: %ld\n", peek_at, ret);
+       */
+
+       /*
+        * Try to access the pkey-protected "ptr" via ptrace:
+        */
+       ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+       /* expect it to work, without an error: */
+       pkey_assert(ret != -1);
+       /* Now access from the current task, and expect an exception: */
+       peek_result = read_ptr(ptr);
+       expected_pk_fault(pkey);
+
+       /*
+        * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+        */
+       ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+       /* expect it to work, without an error: */
+       pkey_assert(ret != -1);
+       /* Now access from the current task, and expect NO exception: */
+       peek_result = read_ptr(plain_ptr);
+       do_not_expect_pk_fault();
+
+       ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+       pkey_assert(ret != -1);
+
+       ret = kill(child_pid, SIGKILL);
+       pkey_assert(ret != -1);
+
+       wait(&status);
+
+       free(plain_ptr_unaligned);
+}
+
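+/*
+ * Cover a page of executable code with a pkey and deny data access:
+ * executing the code must not fault, but a data read of the same page
+ * must.
+ */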
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+       void *p1;
+       int scratch;
+       int ptr_contents;
+       int ret;
+
+       p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+       dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+       /* lots_o_noops_around_write should be page-aligned already */
+       assert(p1 == &lots_o_noops_around_write);
+
+       /* Point 'p1' at the *second* page of the function: */
+       p1 += PAGE_SIZE;
+
+       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+       lots_o_noops_around_write(&scratch);
+       ptr_contents = read_ptr(p1);
+       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+       ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+       pkey_assert(!ret);
+       pkey_access_deny(pkey);
+
+       dprintf2("pkru: %x\n", rdpkru());
+
+       /*
+        * Make sure the page is faulted back in by an *instruction*
+        * fetch, which must not raise a pkey fault:
+        */
+       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+       lots_o_noops_around_write(&scratch);
+       do_not_expect_pk_fault();
+       ptr_contents = read_ptr(p1);
+       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+       expected_pk_fault(pkey);
+}
+
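+/*
+ * Only meaningful on CPUs without pkey support: the raw mprotect_key
+ * syscall is expected to fail there.
+ */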
+void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+       int size = PAGE_SIZE;
+       int sret;
+
+       if (cpu_has_pku()) {
+               dprintf1("SKIP: %s: CPU has pkey support\n", __func__);
+               return;
+       }
+
+       sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
+       pkey_assert(sret < 0);
+}
+
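+/* Each test below gets a freshly allocated pkey and a pkey-protected buffer: */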
+void (*pkey_tests[])(int *ptr, u16 pkey) = {
+       test_read_of_write_disabled_region,
+       test_read_of_access_disabled_region,
+       test_write_of_write_disabled_region,
+       test_write_of_access_disabled_region,
+       test_kernel_write_of_access_disabled_region,
+       test_kernel_write_of_write_disabled_region,
+       test_kernel_gup_of_access_disabled_region,
+       test_kernel_gup_write_to_write_disabled_region,
+       test_executing_on_unreadable_memory,
+       test_ptrace_of_child,
+       test_pkey_syscalls_on_non_allocated_pkey,
+       test_pkey_syscalls_bad_args,
+       test_pkey_alloc_exhaust,
+};
+
+void run_tests_once(void)
+{
+       int *ptr;
+       int prot = PROT_READ|PROT_WRITE;
+
+       for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+               int pkey;
+               int orig_pkru_faults = pkru_faults;
+
+               dprintf1("======================\n");
+               dprintf1("test %d preparing...\n", test_nr);
+
+               tracing_on();
+               pkey = alloc_random_pkey();
+               dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+               ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+               dprintf1("test %d starting...\n", test_nr);
+               pkey_tests[test_nr](ptr, pkey);
+               dprintf1("freeing test memory: %p\n", ptr);
+               free_pkey_malloc(ptr);
+               sys_pkey_free(pkey);
+
+               dprintf1("pkru_faults: %d\n", pkru_faults);
+               dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
+
+               tracing_off();
+               close_test_fds();
+
+               printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+               dprintf1("======================\n\n");
+       }
+       iteration_nr++;
+}
+
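+/* Capture the initial PKRU value as the shadow that later checks compare against: */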
+void pkey_setup_shadow(void)
+{
+       shadow_pkru = __rdpkru();
+}
+
+int main(void)
+{
+       int nr_iterations = 22;
+
+       setup_handlers();
+
+       printf("has pku: %d\n", cpu_has_pku());
+
+       if (!cpu_has_pku()) {
+               int size = PAGE_SIZE;
+               int *ptr;
+
+               printf("running PKEY tests for unsupported CPU/OS\n");
+
+               ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+               assert(ptr != (void *)-1);
+               test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+               exit(0);
+       }
+
+       pkey_setup_shadow();
+       printf("startup pkru: %x\n", rdpkru());
+       setup_hugetlbfs();
+
+       while (nr_iterations-- > 0)
+               run_tests_once();
+
+       printf("done (all tests OK)\n");
+       return 0;
+}
index 421456784bc64d7c2ad5b14753194bcf89905d1d..b037ce9cf116b1da0ef57601f8f1840e45fcc775 100644 (file)
@@ -147,7 +147,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
        if (args.nr != getpid() ||
            args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 ||
            args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
-               printf("[FAIL]\tgetpid() failed to preseve regs\n");
+               printf("[FAIL]\tgetpid() failed to preserve regs\n");
                nerrs++;
        } else {
                printf("[OK]\tgetpid() preserves regs\n");
@@ -162,7 +162,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
        if (args.nr != 0 ||
            args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 ||
            args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
-               printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n");
+               printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n");
                nerrs++;
        } else {
                printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n");