--- /dev/null
+*.c diff=cpp
+*.h diff=cpp
NOTE: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
though ARM64 patches will likely be posted soon.
+
+DMA_ATTR_NO_WARN
+----------------
+
+This tells the DMA-mapping subsystem to suppress allocation failure reports
+(similarly to __GFP_NOWARN).
+
+On some architectures allocation failures are reported with error messages
+to the system logs. Although this can help to identify and debug problems,
+drivers that handle failures (e.g. by retrying later) have no problem with
+them, and depending on how the retry mechanism is implemented such messages
+can flood the system logs even though nothing is actually wrong.
+
+So, this provides a way for drivers to avoid those error messages on calls
+where allocation failures are not a problem, and shouldn't bother the logs.
+
+NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
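
A rough kernel-internal sketch of the intended use (not part of this patch): a driver that implements its own retry path can pass the attribute when mapping a scatterlist, mirroring the dma_map_sg_attrs() call added to the NVMe driver later in this series. The helper name and the retry policy below are illustrative only.

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/scatterlist.h>

/* Hypothetical helper: map a scatterlist without logging on failure;
 * the caller is expected to retry later, so a failed mapping is normal. */
static int example_map_sgl_quiet(struct device *dev, struct scatterlist *sgl,
                                 int nents, enum dma_data_direction dir)
{
        int mapped;

        /* DMA_ATTR_NO_WARN suppresses the allocation failure message that
         * some IOMMU implementations would otherwise print. */
        mapped = dma_map_sg_attrs(dev, sgl, nents, dir, DMA_ATTR_NO_WARN);
        if (!mapped)
                return -ENOMEM;         /* caller retries later */

        return mapped;
}
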
* including this struct */
__s32 ioctlfd; /* automount command fd */
- __u32 arg1; /* Command parameters */
- __u32 arg2;
+ union {
+ struct args_protover protover;
+ struct args_protosubver protosubver;
+ struct args_openmount openmount;
+ struct args_ready ready;
+ struct args_fail fail;
+ struct args_setpipefd setpipefd;
+ struct args_timeout timeout;
+ struct args_requester requester;
+ struct args_expire expire;
+ struct args_askumount askumount;
+ struct args_ismountpoint ismountpoint;
+ };
char path[0];
};
mount point file descriptor, and when requesting the uid and gid of the
last successful mount on a directory within the autofs file system.
-The fields arg1 and arg2 are used to communicate parameters and results of
-calls made as described below.
+The union is used to communicate parameters and results of calls made
+as described below.
The path field is used to pass a path where it is needed and the size field
is used to account for the increased structure length when translating the
Get the major and minor version of the autofs4 protocol understood by the
loaded module. This call requires an initialized struct autofs_dev_ioctl
with the ioctlfd field set to a valid autofs mount point descriptor
-and sets the requested version number in structure field arg1. These
-commands return 0 on success or one of the negative error codes if
-validation fails.
+and sets the requested version number in the version field of struct
+args_protover or the sub_version field of struct args_protosubver. These
+commands return 0 on success or one of the negative error codes if
+validation fails.
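
To make the calling convention concrete, here is a minimal user-space sketch of a protocol version query. It assumes the request is issued on an open /dev/autofs control device, that the AUTOFS_DEV_IOCTL_PROTOVER request macro and the patched structure layout are available from linux/auto_dev-ioctl.h, and that ioctlfd was obtained beforehand with the openmount call described below; devfd and ioctlfd are placeholder names.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

/* Sketch only: ask the module for the protocol version of the autofs
 * mount referred to by 'ioctlfd', via the /dev/autofs descriptor 'devfd'. */
static int query_protover(int devfd, int ioctlfd, unsigned int *version)
{
        struct autofs_dev_ioctl param;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);             /* no path appended */
        param.ioctlfd = ioctlfd;                /* autofs mount point descriptor */

        if (ioctl(devfd, AUTOFS_DEV_IOCTL_PROTOVER, &param) == -1)
                return -1;

        *version = param.protover.version;      /* filled in by the kernel */
        return 0;
}
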
AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
Obtain and release a file descriptor for an autofs managed mount point
path. The open call requires an initialized struct autofs_dev_ioctl with
the path field set and the size field adjusted appropriately as well
-as the arg1 field set to the device number of the autofs mount. The
-device number can be obtained from the mount options shown in
-/proc/mounts. The close call requires an initialized struct
+as the devid field of struct args_openmount set to the device number of
+the autofs mount. The device number can be obtained from the mount options
+shown in /proc/mounts. The close call requires an initialized struct
autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained
from the open call. The release of the file descriptor can also be done
with close(2) so any open descriptors will also be closed at process exit.
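
A sketch of the open call under the same assumptions as the example above (an open /dev/autofs descriptor, plus the AUTOFS_DEV_IOCTL_OPENMOUNT request macro and structure layout from linux/auto_dev-ioctl.h); the devid value is the device number taken from the autofs mount options in /proc/mounts, as described above.

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

/* Sketch only: obtain an ioctlfd for the autofs mount at 'path' whose
 * device number (from /proc/mounts) is 'devid'.  Returns -1 on failure. */
static int open_autofs_ioctlfd(int devfd, const char *path, __u32 devid)
{
        struct autofs_dev_ioctl *param;
        size_t plen = strlen(path) + 1;
        int ret;

        param = calloc(1, sizeof(*param) + plen);
        if (!param)
                return -1;

        param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param->size = sizeof(*param) + plen;    /* size accounts for the path */
        param->ioctlfd = -1;                    /* not used by openmount */
        param->openmount.devid = devid;
        memcpy(param->path, path, plen);

        ret = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param);
        ret = (ret < 0) ? -1 : param->ioctlfd;  /* fd is handed back in ioctlfd */
        free(param);
        return ret;
}
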
Return mount and expire result status from user space to the kernel.
Both of these calls require an initialized struct autofs_dev_ioctl
with the ioctlfd field set to the descriptor obtained from the open
-call and the arg1 field set to the wait queue token number, received
-by user space in the foregoing mount or expire request. The arg2 field
-is set to the status to be returned. For the ready call this is always
-0 and for the fail call it is set to the errno of the operation.
+call and the token field of struct args_ready or struct args_fail set
+to the wait queue token number, received by user space in the foregoing
+mount or expire request. For the fail call the status field of struct
+args_fail is set to the errno of the failed operation; the ready call
+carries no status.
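
A sketch of how a daemon might acknowledge a request using the union, under the same assumptions as the previous examples (AUTOFS_DEV_IOCTL_READY and AUTOFS_DEV_IOCTL_FAIL request macros, /dev/autofs descriptor in devfd); the token value comes from the preceding mount or expire request.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

/* Sketch only: acknowledge the request identified by 'token'.  A zero
 * 'status' sends READY; a non-zero errno value sends FAIL. */
static int send_ready_or_fail(int devfd, int ioctlfd, __u32 token, int status)
{
        struct autofs_dev_ioctl param;

        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size = sizeof(param);
        param.ioctlfd = ioctlfd;

        if (status == 0) {
                param.ready.token = token;
                return ioctl(devfd, AUTOFS_DEV_IOCTL_READY, &param);
        }

        param.fail.token = token;
        param.fail.status = status;     /* errno of the failed operation */
        return ioctl(devfd, AUTOFS_DEV_IOCTL_FAIL, &param);
}
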
AUTOFS_DEV_IOCTL_SETPIPEFD_CMD
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call and
-the arg1 field set to descriptor of the pipe. On success the call
-also sets the process group id used to identify the controlling process
-(eg. the owning automount(8) daemon) to the process group of the caller.
+the pipefd field of struct args_setpipefd set to the descriptor of the pipe.
+On success the call also sets the process group id used to identify the
+controlling process (eg. the owning automount(8) daemon) to the process
+group of the caller.
AUTOFS_DEV_IOCTL_CATATONIC_CMD
The call requires an initialized struct autofs_dev_ioctl with the path
field set to the mount point in question and the size field adjusted
-appropriately as well as the arg1 field set to the device number of the
-containing autofs mount. Upon return the struct field arg1 contains the
-uid and arg2 the gid.
+appropriately. Upon return the uid field of struct args_requester contains
+the uid and the gid field contains the gid.
When reconstructing an autofs mount tree with active mounts we need to
re-connect to mounts that may have used the original process uid and
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call. In
addition an immediate expire, independent of the mount timeout, can be
-requested by setting the arg1 field to 1. If no expire candidates can
-be found the ioctl returns -1 with errno set to EAGAIN.
+requested by setting the how field of struct args_expire to 1. If no
+expire candidates can be found the ioctl returns -1 with errno set to
+EAGAIN.
This call causes the kernel module to check the mount corresponding
to the given ioctlfd for mounts that can be expired, issues an expire
The call requires an initialized struct autofs_dev_ioctl with the
ioctlfd field set to the descriptor obtained from the open call and
-it returns the result in the arg1 field, 1 for busy and 0 otherwise.
+it returns the result in the may_umount field of struct args_askumount,
+1 if the mount may be umounted and 0 if it is busy.
AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
possible variations. Both use the path field set to the path of the mount
point to check and the size field adjusted appropriately. One uses the
ioctlfd field to identify a specific mount point to check while the other
-variation uses the path and optionally arg1 set to an autofs mount type.
-The call returns 1 if this is a mount point and sets arg1 to the device
-number of the mount and field arg2 to the relevant super block magic
-number (described below) or 0 if it isn't a mountpoint. In both cases
-the the device number (as returned by new_encode_dev()) is returned
-in field arg1.
+variation uses the path and optionally the in.type field of struct
+args_ismountpoint set to an autofs mount type. The call returns 1 if this
+is a mount point and sets the out.devid field to the device number of the
+mount and the out.magic field to the relevant super block magic number
+(described below) or 0 if it isn't a mountpoint. In both cases the device
+number (as returned by new_encode_dev()) is returned in the out.devid field.
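
A sketch of the path-only variation, again assuming the AUTOFS_DEV_IOCTL_ISMOUNTPOINT request macro and the patched structure layout from linux/auto_dev-ioctl.h; in.type is left zero here, which matches any autofs mount type.

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

/* Sketch only: check whether 'path' is a mountpoint.  On a positive result
 * the device number and super block magic are copied back to the caller. */
static int is_mountpoint(int devfd, const char *path,
                         __u32 *devid, __u32 *magic)
{
        struct autofs_dev_ioctl *param;
        size_t plen = strlen(path) + 1;
        int ret;

        param = calloc(1, sizeof(*param) + plen);
        if (!param)
                return -1;

        param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param->size = sizeof(*param) + plen;
        param->ioctlfd = -1;            /* path variation, no specific mount */
        /* param->ismountpoint.in.type stays 0: match any autofs mount type */
        memcpy(param->path, path, plen);

        ret = ioctl(devfd, AUTOFS_DEV_IOCTL_ISMOUNTPOINT, param);
        if (ret == 1) {                 /* it is a mountpoint */
                *devid = param->ismountpoint.out.devid;
                *magic = param->ismountpoint.out.magic;
        }
        free(param);
        return ret;                     /* 1, 0, or -1 with errno set */
}
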
If supplied with a file descriptor we're looking for a specific mount,
not necessarily at the top of the mounted stack. In this case the path
Mountpoint expiry
-----------------
-The VFS has a mechansim for automatically expiring unused mounts,
+The VFS has a mechanism for automatically expiring unused mounts,
much as it can expire any unused dentry information from the dcache.
-This is guided by the MNT_SHRINKABLE flag. This only applies to
+This is guided by the MNT_SHRINKABLE flag. This only applies to
mounts that were created by `d_automount()` returning a filesystem to be
mounted. As autofs doesn't return such a filesystem but leaves the
mounting to the automount daemon, it must involve the automount daemon
autofs knows whether a process requesting some operation is the daemon
or not based on its process-group id number (see getpgid(2)).
-When an autofs filesystem it mounted the pgid of the mounting
+When an autofs filesystem is mounted the pgid of the mounting
process is recorded unless the "pgrp=" option is given, in which
case that number is recorded instead. Any request arriving from a
process in that process group is considered to come from the daemon.
numbers for existing filesystems can be found in
`/proc/self/mountinfo`.
- **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
-- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
+- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
catatonic mode, this can provide the write end of a new pipe
in `arg1` to re-establish communication with a daemon. The
process group of the calling process is used to identify the
# 4) Check for missing system calls
# 5) Generate constants.py (may need bounds.h)
-# Default sed regexp - multiline due to syntax constraints
-define sed-y
- "/^->/{s:->#\(.*\):/* \1 */:; \
- s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:->::; p;}"
-endef
-
# Use filechk to avoid rebuilds when a header changes, but the resulting file
# does not
define filechk_offsets
echo " *"; \
echo " * This file was generated by Kbuild"; \
echo " */"; \
- echo ""; \
- sed -ne $(sed-y); \
- echo ""; \
+ sed -ne '/#define/{s/\$$//;s/#//2;s/$$/*\//;p;}'; \
echo "#endif" )
endef
F: drivers/cpufreq/intel_pstate.c
INTEL FRAMEBUFFER DRIVER (excluding 810 and 815)
-M: Maik Broemme <mbroemme@plusserver.de>
+M: Maik Broemme <mbroemme@libmpq.org>
L: linux-fbdev@vger.kernel.org
S: Maintained
F: Documentation/fb/intelfb.txt
F: net/8021q/
VLYNQ BUS
-M: Florian Fainelli <florian@openwrt.org>
+M: Florian Fainelli <f.fainelli@gmail.com>
L: openwrt-devel@lists.openwrt.org (subscribers-only)
S: Maintained
F: drivers/vlynq/vlynq.c
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
_text = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
#endif
#ifdef CONFIG_SMP
-extern void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x)
+extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
static inline int nr_legacy_irqs(void)
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x02000000);
}
#ifdef CONFIG_MMU
static void raise_nmi(cpumask_t *mask)
{
- /*
- * Generate the backtrace directly if we are running in a calling
- * context that is not preemptible by the backtrace IPI. Note
- * that nmi_cpu_backtrace() automatically removes the current cpu
- * from mask.
- */
- if (cpumask_test_cpu(smp_processor_id(), mask) && irqs_disabled())
- nmi_cpu_backtrace(NULL);
-
smp_cross_call(mask, IPI_CPU_BACKTRACE);
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
- nmi_trigger_all_cpu_backtrace(include_self, raise_nmi);
+ nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_nmi);
}
IRQENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.gnu.warning)
SOFTIRQENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
HYPERVISOR_TEXT
KPROBES_TEXT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk;
-
if (is_compat_task())
- range_end += 0x02000000;
+ return randomize_page(mm->brk, 0x02000000);
else
- range_end += 0x40000000;
-
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x40000000);
}
ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
HYPERVISOR_TEXT
KPROBES_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
#ifndef CONFIG_SCHEDULE_L1
SCHED_TEXT
#endif
+ CPUIDLE_TEXT
LOCK_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.text.__*)
*(.text..tlbmiss)
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
#ifdef CONFIG_DEBUG_INFO
INIT_TEXT
_stext = . ;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
#if defined(CONFIG_ROMKERNEL)
*(.int_redirect)
_text = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
# The gate DSO image is built using a special linker script.
include $(src)/Makefile.gate
-# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
-define sed-y
- "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"
-endef
quiet_cmd_nr_irqs = GEN $@
define cmd_nr_irqs
(set -e; \
echo " * This file was generated by Kbuild"; \
echo " *"; \
echo " */"; \
- echo ""; \
- sed -ne $(sed-y) $<; \
- echo ""; \
+ sed -ne '/#define/{s/\$$//;s/#//2;s/$$/*\//p;}' $<; \
echo "#endif" ) > $@
endef
__end_ivt_text = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.gnu.linkonce.t*)
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
. = ALIGN(16);
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
.text : {
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
EXIT_TEXT
EXIT_CALL
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
default_machine_crash_shutdown(regs);
}
+#ifdef CONFIG_SMP
+void octeon_crash_smp_send_stop(void)
+{
+ int cpu;
+
+ /* disable watchdogs */
+ for_each_online_cpu(cpu)
+ cvmx_write_csr(CVMX_CIU_WDOGX(cpu_logical_map(cpu)), 0);
+}
+#endif
+
#endif /* CONFIG_KEXEC */
#ifdef CONFIG_CAVIUM_RESERVE32
_machine_kexec_shutdown = octeon_shutdown;
_machine_crash_shutdown = octeon_crash_shutdown;
_machine_kexec_prepare = octeon_kexec_prepare;
+#ifdef CONFIG_SMP
+ _crash_smp_send_stop = octeon_crash_smp_send_stop;
+#endif
#endif
octeon_user_io_init();
extern int get_c0_fdc_int(void);
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif /* _ASM_IRQ_H */
extern unsigned long secondary_kexec_args[4];
extern void (*relocated_kexec_smp_wait) (void *);
extern atomic_t kexec_ready_to_reboot;
+extern void (*_crash_smp_send_stop)(void);
#endif
#endif
unsigned long saved_trap_nr;
};
-extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup,
- struct mm_struct *mm, unsigned long addr);
-extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
-extern int arch_uprobe_exception_notify(struct notifier_block *self,
- unsigned long val, void *data);
-extern void arch_uprobe_abort_xol(struct arch_uprobe *aup,
- struct pt_regs *regs);
-extern unsigned long arch_uretprobe_hijack_return_addr(
- unsigned long trampoline_vaddr, struct pt_regs *regs);
-
#endif /* __ASM_UPROBES_H */
static void crash_kexec_prepare_cpus(void)
{
+ static int cpus_stopped;
unsigned int msecs;
+ unsigned int ncpus;
- unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+ if (cpus_stopped)
+ return;
+
+ ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
dump_send_ipi(crash_shutdown_secondary);
smp_wmb();
cpu_relax();
mdelay(1);
}
+
+ cpus_stopped = 1;
+}
+
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ if (_crash_smp_send_stop)
+ _crash_smp_send_stop();
+
+ crash_kexec_prepare_cpus();
}
#else /* !defined(CONFIG_SMP) */
#ifdef CONFIG_SMP
void (*relocated_kexec_smp_wait) (void *);
atomic_t kexec_ready_to_reboot = ATOMIC_INIT(0);
+void (*_crash_smp_send_stop)(void) = NULL;
#endif
int
dump_stack();
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
- smp_call_function(arch_dump_stack, NULL, 1);
+ long this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask) && !exclude_self)
+ dump_stack();
+
+ smp_call_function_many(mask, arch_dump_stack, NULL, 1);
+
+ put_cpu();
}
int mips_get_process_fp_mode(struct task_struct *task)
.text : {
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
.text : {
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
.text ALIGN(PAGE_SIZE) : {
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
#else
#define memory_hotplug_max() memblock_end_of_DRAM()
#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#ifdef CONFIG_FA_DUMP
+#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
+#endif
#endif /* __KERNEL__ */
#endif /* _ASM_MMZONE_H_ */
return 1;
}
+unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return memblock_reserved_size() / PAGE_SIZE;
+}
+
/* Look for fadump= cmdline option. */
static int __init early_fadump_param(char *p)
{
/* Handle failure */
if (unlikely(entry == DMA_ERROR_CODE)) {
- if (printk_ratelimit())
+ if (!(attrs & DMA_ATTR_NO_WARN) &&
+ printk_ratelimit())
dev_info(dev, "iommu_alloc failed, tbl %p "
"vaddr %lx npages %lu\n", tbl, vaddr,
npages);
mask >> tbl->it_page_shift, align,
attrs);
if (dma_handle == DMA_ERROR_CODE) {
- if (printk_ratelimit()) {
+ if (!(attrs & DMA_ATTR_NO_WARN) &&
+ printk_ratelimit()) {
dev_info(dev, "iommu_alloc failed, tbl %p "
"vaddr %p npages %d\n", tbl, vaddr,
npages);
/* careful! __ftr_alt_* sections need to be close to .text */
*(.text .fixup __ftr_alt_* .ref.text)
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
struct arch_uprobe_task {
};
-int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm,
- unsigned long addr);
-int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
-int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
- void *data);
-void arch_uprobe_abort_xol(struct arch_uprobe *ap, struct pt_regs *regs);
-unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline,
- struct pt_regs *regs);
#endif /* _ASM_UPROBES_H */
kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
- kgid = GROUP_AT(group_info, i);
+ kgid = group_info->gid[i];
group = (u16)from_kgid_munged(user_ns, kgid);
if (put_user(group, grouplist+i))
return -EFAULT;
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
_text = .; /* Text and read-only data */
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.text.*)
TEXT_TEXT
EXTRA_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
return retval;
}
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
extern void *hardirq_stack[NR_CPUS];
extern void *softirq_stack[NR_CPUS];
}
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
struct thread_info *tp = current_thread_info();
struct pt_regs *regs = get_irq_regs();
memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
- if (include_self)
+ if (cpumask_test_cpu(this_cpu, mask) && !exclude_self)
__global_reg_self(tp, regs, this_cpu);
smp_fetch_global_regs();
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, mask) {
struct global_reg_snapshot *gp;
- if (!include_self && cpu == this_cpu)
+ if (exclude_self && cpu == this_cpu)
continue;
gp = &global_cpu_snapshot[cpu].reg;
static void sysrq_handle_globreg(int key)
{
- arch_trigger_all_cpu_backtrace(true);
+ trigger_all_cpu_backtrace();
}
static struct sysrq_key_op sparc_globalreg_op = {
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
void setup_irq_regs(void);
#ifdef __tilegx__
-void arch_trigger_all_cpu_backtrace(bool self);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
#endif /* _ASM_TILE_IRQ_H */
* When interrupted at _cpu_idle_nap, we bump the PC forward 8, and
* as a result return to the function that called _cpu_idle().
*/
-STD_ENTRY(_cpu_idle)
+STD_ENTRY_SECTION(_cpu_idle, .cpuidle.text)
movei r1, 1
IRQ_ENABLE_LOAD(r2, r3)
mtspr INTERRUPT_CRITICAL_SECTION, r1
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/atomic.h>
-#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/pmc.h>
if (!perf_irq)
panic("Unexpected PERF_COUNT interrupt %d\n", fault);
- nmi_enter();
retval = perf_irq(regs, fault);
- nmi_exit();
return retval;
}
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/compat.h>
-#include <linux/hardirq.h>
+#include <linux/nmi.h>
#include <linux/syscalls.h>
#include <linux/kernel.h>
#include <linux/tracehook.h>
tile_show_stack(&kbt);
}
-/* To ensure stack dump on tiles occurs one by one. */
-static DEFINE_SPINLOCK(backtrace_lock);
-/* To ensure no backtrace occurs before all of the stack dump are done. */
-static atomic_t backtrace_cpus;
-/* The cpu mask to avoid reentrance. */
-static struct cpumask backtrace_mask;
-
-void do_nmi_dump_stack(struct pt_regs *regs)
-{
- int is_idle = is_idle_task(current) && !in_interrupt();
- int cpu;
-
- nmi_enter();
- cpu = smp_processor_id();
- if (WARN_ON_ONCE(!cpumask_test_and_clear_cpu(cpu, &backtrace_mask)))
- goto done;
-
- spin_lock(&backtrace_lock);
- if (is_idle)
- pr_info("CPU: %d idle\n", cpu);
- else
- show_regs(regs);
- spin_unlock(&backtrace_lock);
- atomic_dec(&backtrace_cpus);
-done:
- nmi_exit();
-}
-
#ifdef __tilegx__
-void arch_trigger_all_cpu_backtrace(bool self)
+void nmi_raise_cpu_backtrace(struct cpumask *in_mask)
{
struct cpumask mask;
HV_Coord tile;
unsigned int timeout;
int cpu;
- int ongoing;
HV_NMI_Info info[NR_CPUS];
- ongoing = atomic_cmpxchg(&backtrace_cpus, 0, num_online_cpus() - 1);
- if (ongoing != 0) {
- pr_err("Trying to do all-cpu backtrace.\n");
- pr_err("But another all-cpu backtrace is ongoing (%d cpus left)\n",
- ongoing);
- if (self) {
- pr_err("Reporting the stack on this cpu only.\n");
- dump_stack();
- }
- return;
- }
-
- cpumask_copy(&mask, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), &mask);
- cpumask_copy(&backtrace_mask, &mask);
-
- /* Backtrace for myself first. */
- if (self)
- dump_stack();
-
/* Tentatively dump stack on remote tiles via NMI. */
timeout = 100;
+ cpumask_copy(&mask, in_mask);
while (!cpumask_empty(&mask) && timeout) {
for_each_cpu(cpu, &mask) {
tile.x = cpu_x(cpu);
}
mdelay(10);
+ touch_softlockup_watchdog();
timeout--;
}
- /* Warn about cpus stuck in ICS and decrement their counts here. */
+ /* Warn about cpus stuck in ICS. */
if (!cpumask_empty(&mask)) {
for_each_cpu(cpu, &mask) {
+
+ /* Clear the bit as if nmi_cpu_backtrace() ran. */
+ cpumask_clear_cpu(cpu, in_mask);
+
switch (info[cpu].result) {
case HV_NMI_RESULT_FAIL_ICS:
pr_warn("Skipping stack dump of cpu %d in ICS at pc %#llx\n",
cpu);
break;
case HV_ENOSYS:
- pr_warn("Hypervisor too old to allow remote stack dumps.\n");
- goto skip_for_each;
+ WARN_ONCE(1, "Hypervisor too old to allow remote stack dumps.\n");
+ break;
default: /* should not happen */
pr_warn("Skipping stack dump of cpu %d [%d,%#llx]\n",
cpu, info[cpu].result, info[cpu].pc);
break;
}
}
-skip_for_each:
- atomic_sub(cpumask_weight(&mask), &backtrace_cpus);
}
}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
+}
#endif /* __tilegx_ */
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/nmi.h>
#include <asm/stack.h>
#include <asm/traps.h>
#include <asm/setup.h>
void do_nmi(struct pt_regs *regs, int fault_num, unsigned long reason)
{
+ nmi_enter();
switch (reason) {
+#ifdef arch_trigger_cpumask_backtrace
case TILE_NMI_DUMP_STACK:
- do_nmi_dump_stack(regs);
+ nmi_cpu_backtrace(regs);
break;
+#endif
default:
panic("Unexpected do_nmi type %ld", reason);
- return;
}
+ nmi_exit();
}
/* Deprecated function currently only used here. */
.text : AT (ADDR(.text) - LOAD_OFFSET) {
HEAD_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x02000000);
}
echo " *"; \
echo " */"; \
echo ""; \
- sed -ne "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"; \
+ sed -ne '/#define/{s/\$$//;s/#//2;s/$$/*\//p;}'; \
echo ""; )
endef
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
*(.stub .text.* .gnu.linkonce.t.*)
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
/* .gnu.warning sections are handled specially by elf32.em. */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x02000000);
}
/*
.text : { /* Real text segment */
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
*(.fixup)
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
#endif /* _ASM_X86_IRQ_H */
#include <asm/processor-flags.h>
#ifndef __ASSEMBLY__
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
/*
* Interrupt control:
*/
asm volatile("sti": : :"memory");
}
-static inline void native_safe_halt(void)
+static inline __cpuidle void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
}
-static inline void native_halt(void)
+static inline __cpuidle void native_halt(void)
{
asm volatile("hlt": : :"memory");
}
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void arch_safe_halt(void)
+static inline __cpuidle void arch_safe_halt(void)
{
native_safe_halt();
}
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline void halt(void)
+static inline __cpuidle void halt(void)
{
native_halt();
}
typedef void crash_vmclear_fn(void);
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+extern void kdump_nmi_shootdown_cpus(void);
#endif /* __ASSEMBLY__ */
void (*smp_cpus_done)(unsigned max_cpus);
void (*stop_other_cpus)(int wait);
+ void (*crash_stop_other_cpus)(void);
void (*smp_send_reschedule)(int cpu);
int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
}
#endif
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
apic->send_IPI_mask(mask, NMI_VECTOR);
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
- nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
}
-static int
-arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
if (nmi_cpu_backtrace(regs))
return NMI_HANDLED;
return NMI_DONE;
}
-NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
-static int __init register_trigger_all_cpu_backtrace(void)
+static int __init register_nmi_cpu_backtrace_handler(void)
{
- register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
0, "arch_bt");
return 0;
}
-early_initcall(register_trigger_all_cpu_backtrace);
+early_initcall(register_nmi_cpu_backtrace_handler);
#endif
disable_local_APIC();
}
-static void kdump_nmi_shootdown_cpus(void)
+void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ if (cpus_stopped)
+ return;
+
+ if (smp_ops.crash_stop_other_cpus)
+ smp_ops.crash_stop_other_cpus();
+ else
+ smp_send_stop();
+
+ cpus_stopped = 1;
+}
+
#else
-static void kdump_nmi_shootdown_cpus(void)
+void crash_smp_send_stop(void)
{
/* There are no cpus to shootdown */
}
/* The kernel is broken so disable interrupts */
local_irq_disable();
- kdump_nmi_shootdown_cpus();
+ crash_smp_send_stop();
/*
* VMCLEAR VMCSs loaded on this cpu if needed.
nr_free_pages += end_pfn - start_pfn;
}
- set_dma_reserve(nr_pages - nr_free_pages);
+ set_memory_reserve(nr_pages - nr_free_pages, false);
#endif
}
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
/*
* We use this if we don't have any better idle routine..
*/
-void default_idle(void)
+void __cpuidle default_idle(void)
{
trace_cpu_idle_rcuidle(1, smp_processor_id());
safe_halt();
* with interrupts enabled and no flags, which is backwards compatible with the
* original MWAIT implementation.
*/
-static void mwait_idle(void)
+static __cpuidle void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
trace_cpu_idle_rcuidle(1, smp_processor_id());
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x02000000);
}
/*
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_KEXEC_CORE)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
+#endif
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
unsigned long *end)
{
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
- unsigned long new_begin;
/* This is usually needed to map code in small
model, so it needs to be in the first 31bit. Limit
it to that. This means we need to move the
*begin = 0x40000000;
*end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
- new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
- if (new_begin)
- *begin = new_begin;
+ *begin = randomize_page(*begin, 0x02000000);
}
} else {
*begin = current->mm->mmap_legacy_base;
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
ENTRY_TEXT
#include <asm/mman.h>
#define DEFINE(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+ asm volatile ("#define " #sym " %0 /*" #val :: "i" (val))
#define BLANK() asm volatile("\n->" : : )
#endif
#define DEFINE(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+ asm volatile ("#define " #sym " %0 /*" #val :: "i" (val))
#define DEFINE_LONGS(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long)))
+ asm volatile ("#define " #sym " %0 /*" #val :: "i" (val / sizeof(unsigned long)))
void foo(void)
{
VMLINUX_SYMBOL(__sched_text_start) = .;
*(.sched.literal .sched.text)
VMLINUX_SYMBOL(__sched_text_end) = .;
+ VMLINUX_SYMBOL(__cpuidle_text_start) = .;
+ *(.cpuidle.literal .cpuidle.text)
+ VMLINUX_SYMBOL(__cpuidle_text_end) = .;
VMLINUX_SYMBOL(__lock_text_start) = .;
*(.spinlock.literal .spinlock.text)
VMLINUX_SYMBOL(__lock_text_end) = .;
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
#include <linux/sched.h> /* need_resched() */
#include <linux/tick.h>
#include <linux/cpuidle.h>
+#include <linux/cpu.h>
#include <acpi/processor.h>
/*
* Callers should disable interrupts before the call and enable
* interrupts after return.
*/
-static void acpi_safe_halt(void)
+static void __cpuidle acpi_safe_halt(void)
{
if (!tif_need_resched()) {
safe_halt();
*
* Caller disables interrupt before call and enables interrupt after return.
*/
-static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx)
{
if (cx->entry_method == ACPI_CSTATE_FFH) {
/* Call into architectural FFH based C-state */
err:
unlock_device_hotplug();
- if (ret)
+ if (ret < 0)
return ret;
+ if (ret)
+ return -EINVAL;
+
return count;
}
}
EXPORT_SYMBOL(get_random_long);
-/*
- * randomize_range() returns a start address such that
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
*
- * [...... <range> .....]
- * start end
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
*
- * a <range> with size "len" starting at the return value is inside in the
- * area defined by [start, end], but is otherwise randomized.
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
*/
unsigned long
-randomize_range(unsigned long start, unsigned long end, unsigned long len)
+randomize_page(unsigned long start, unsigned long range)
{
- unsigned long range = end - len - start;
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
- if (end <= start + len)
- return 0;
- return PAGE_ALIGN(get_random_int() % range + start);
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
}
/* Interface for in-kernel drivers of true hardware RNGs.
#include <linux/cpuidle.h>
#include <linux/cpumask.h>
#include <linux/tick.h>
+#include <linux/cpu.h>
#include "cpuidle.h"
}
#ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev,
- struct cpuidle_driver *drv, int index)
+static int __cpuidle poll_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
local_irq_enable();
if (!current_set_polling_and_test()) {
*
* Must be called under local_irq_disable().
*/
-static int intel_idle(struct cpuidle_device *dev,
- struct cpuidle_driver *drv, int index)
+static __cpuidle int intel_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
unsigned long ecx = 1; /* break on interrupt flag */
struct cpuidle_state *state = &drv->states[index];
goto out;
ret = BLK_MQ_RQ_QUEUE_BUSY;
- if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
+ if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
+ DMA_ATTR_NO_WARN))
goto out;
if (!nvme_setup_prps(dev, req, size))
name = of_get_property(of_aliases, "stdout", NULL);
if (name)
of_stdout = of_find_node_opts_by_path(name, &of_stdout_options);
+ if (of_stdout)
+ console_set_by_of();
}
if (!of_aliases)
{
struct rio_cm_msg msg;
void *buf;
- int ret = 0;
+ int ret;
if (copy_from_user(&msg, arg, sizeof(msg)))
return -EFAULT;
if (msg.size > RIO_MAX_MSG_SIZE)
return -EINVAL;
- buf = kmalloc(msg.size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, (void __user *)(uintptr_t)msg.msg, msg.size)) {
- ret = -EFAULT;
- goto out;
- }
+ buf = memdup_user((void __user *)(uintptr_t)msg.msg, msg.size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
ret = riocm_ch_send(msg.ch_num, buf, msg.size);
-out:
+
kfree(buf);
return ret;
}
task_lock(current);
if (pud->pud_ngroups > current_ngroups)
pud->pud_ngroups = current_ngroups;
- memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+ memcpy(pud->pud_groups, current_cred()->group_info->gid,
pud->pud_ngroups * sizeof(__u32));
task_unlock(current);
#define AUTOFS_IOC_COUNT 32
#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
-#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT \
+ (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/current.h>
#include <linux/uaccess.h>
-/* #define DEBUG */
-
#ifdef pr_fmt
#undef pr_fmt
#endif
int max_proto;
unsigned long exp_timeout;
unsigned int type;
- int reghost_enabled;
- int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
struct mutex pipe_mutex;
}
}
-extern void autofs4_kill_sb(struct super_block *);
+void autofs4_kill_sb(struct super_block *);
if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
(param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
pr_warn("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n",
AUTOFS_DEV_IOCTL_VERSION_MAJOR,
AUTOFS_DEV_IOCTL_VERSION_MINOR,
param->ver_major, param->ver_minor, cmd);
return sbi;
}
+/* Return autofs dev ioctl version */
+static int autofs_dev_ioctl_version(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ /* This should have already been set. */
+ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+ param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+ return 0;
+}
+
/* Return autofs module protocol version */
static int autofs_dev_ioctl_protover(struct file *fp,
struct autofs_sb_info *sbi,
static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
{
- static struct {
- int cmd;
- ioctl_fn fn;
- } _ioctls[] = {
- {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
- {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
- autofs_dev_ioctl_protover},
- {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
- autofs_dev_ioctl_protosubver},
- {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
- autofs_dev_ioctl_openmount},
- {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
- autofs_dev_ioctl_closemount},
- {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
- autofs_dev_ioctl_ready},
- {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
- autofs_dev_ioctl_fail},
- {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
- autofs_dev_ioctl_setpipefd},
- {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
- autofs_dev_ioctl_catatonic},
- {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
- autofs_dev_ioctl_timeout},
- {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
- autofs_dev_ioctl_requester},
- {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
- autofs_dev_ioctl_expire},
- {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
- autofs_dev_ioctl_askumount},
- {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
- autofs_dev_ioctl_ismountpoint}
+ static ioctl_fn _ioctls[] = {
+ autofs_dev_ioctl_version,
+ autofs_dev_ioctl_protover,
+ autofs_dev_ioctl_protosubver,
+ autofs_dev_ioctl_openmount,
+ autofs_dev_ioctl_closemount,
+ autofs_dev_ioctl_ready,
+ autofs_dev_ioctl_fail,
+ autofs_dev_ioctl_setpipefd,
+ autofs_dev_ioctl_catatonic,
+ autofs_dev_ioctl_timeout,
+ autofs_dev_ioctl_requester,
+ autofs_dev_ioctl_expire,
+ autofs_dev_ioctl_askumount,
+ autofs_dev_ioctl_ismountpoint,
};
unsigned int idx = cmd_idx(cmd);
- return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+ return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
}
/* ioctl dispatcher */
cmd = _IOC_NR(command);
if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
- cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+ cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) {
return -ENOTTY;
}
if (err)
goto out;
- /* The validate routine above always sets the version */
- if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
- goto done;
-
fn = lookup_dev_ioctl(cmd);
if (!fn) {
pr_warn("unknown command 0x%08x\n", command);
- return -ENOTTY;
+ err = -ENOTTY;
+ goto out;
}
fp = NULL;
/*
* For obvious reasons the openmount can't have a file
* descriptor yet. We don't take a reference to the
- * file during close to allow for immediate release.
+ * file during close to allow for immediate release; the same
+ * applies when retrieving the ioctl version.
*/
- if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+ cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
fp = fget(param->ioctlfd);
if (!fp) {
if (fp)
fput(fp);
-done:
if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
err = -EFAULT;
out:
goto fail_dput;
}
+ /* Test versions first */
+ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
+ sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
+ pr_err("kernel does not match daemon version "
+ "daemon (%d, %d) kernel (%d, %d)\n",
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ goto fail_dput;
+ }
+
+ /* Establish highest kernel protocol version */
+ if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
+ sbi->version = AUTOFS_MAX_PROTO_VERSION;
+ else
+ sbi->version = sbi->max_proto;
+ sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
root_inode->i_fop = &autofs4_root_operations;
root_inode->i_op = &autofs4_dir_inode_operations;
- /* Couldn't this be tested earlier? */
- if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
- sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- pr_err("kernel does not match daemon version "
- "daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
- goto fail_dput;
- }
-
- /* Establish highest kernel protocol version */
- if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
- sbi->version = AUTOFS_MAX_PROTO_VERSION;
- else
- sbi->version = sbi->max_proto;
- sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
-
pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
pr_err("could not open pipe file descriptor\n");
- goto fail_dput;
+ goto fail_put_pid;
}
ret = autofs_prepare_pipe(pipe);
if (ret < 0)
fail_fput:
pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
- /* fall through */
+fail_put_pid:
+ put_pid(sbi->oz_pgrp);
fail_dput:
dput(root);
goto fail_free;
fail_ino:
- kfree(ino);
+ autofs4_free_ino(ino);
fail_free:
- put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
return ret;
inode->i_fop = &autofs4_dir_operations;
} else if (S_ISLNK(mode)) {
inode->i_op = &autofs4_symlink_inode_operations;
- }
+ } else
+ WARN_ON(1);
return inode;
}
inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
if (!inode) {
kfree(cp);
- if (!dentry->d_fsdata)
- kfree(ino);
return -ENOMEM;
}
inode->i_private = cp;
if (may_umount(mnt))
status = 1;
- pr_debug("returning %d\n", status);
+ pr_debug("may umount %d\n", status);
status = put_user(status, p);
#include <asm/ioctls.h>
#include "internal.h"
-int compat_log = 1;
-
-int compat_printk(const char *fmt, ...)
-{
- va_list ap;
- int ret;
- if (!compat_log)
- return 0;
- va_start(ap, fmt);
- ret = vprintk(fmt, ap);
- va_end(ap);
- return ret;
-}
-
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
if (!write && !buffer_mapped(&bh)) {
spinlock_t *ptl;
pmd_t entry;
- struct page *zero_page = get_huge_zero_page();
+ struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
if (unlikely(!zero_page)) {
dax_pmd_dbg(&bh, address, "no zero page");
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
loff_t i_size = i_size_read(page_file_mapping(page)->host);
if (i_size > 0) {
- pgoff_t page_index = page_file_index(page);
+ pgoff_t index = page_index(page);
pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
- if (page_index < end_index)
+ if (index < end_index)
return PAGE_SIZE;
- if (page_index == end_index)
+ if (index == end_index)
return ((i_size - 1) & ~PAGE_MASK) + 1;
}
return 0;
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
if (page) {
- req->wb_index = page_file_index(page);
+ req->wb_index = page_index(page);
get_page(page);
}
req->wb_offset = offset;
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_SIZE, page_file_index(page));
+ page, PAGE_SIZE, page_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
end_index = (i_size - 1) >> PAGE_SHIFT;
- if (i_size > 0 && page_file_index(page) < end_index)
+ if (i_size > 0 && page_index(page) < end_index)
goto out;
end = page_file_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
{
int ret;
- nfs_pageio_cond_complete(pgio, page_file_index(page));
+ nfs_pageio_cond_complete(pgio, page_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
launder);
if (ret == -EAGAIN) {
goto oom;
for (i = 0; i < rqgi->ngroups; i++) {
- if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
- GROUP_AT(gi, i) = exp->ex_anon_gid;
+ if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i]))
+ gi->gid[i] = exp->ex_anon_gid;
else
- GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+ gi->gid[i] = rqgi->gid[i];
}
} else {
gi = get_group_info(rqgi);
if (g1->ngroups != g2->ngroups)
return false;
for (i=0; i<g1->ngroups; i++)
- if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
+ if (!gid_eq(g1->gid[i], g2->gid[i]))
return false;
return true;
}
BUG_ON(o2net_listen_sock != NULL);
mlog(ML_KTHREAD, "starting o2net thread...\n");
- o2net_wq = create_singlethread_workqueue("o2net");
+ o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);
if (o2net_wq == NULL) {
mlog(ML_ERROR, "unable to launch o2net thread\n");
return -ENOMEM; /* ? */
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
- u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
- lock->convert_pending = 0;
/* if it failed, move it back to granted queue.
* if master returns DLM_NORMAL and then down before sending ast,
* it may have already been moved to granted queue, reset to
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
- } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
- (old_owner != res->owner)) {
- mlog(0, "res %.*s is in recovering or has been recovered.\n",
- res->lockname.len, res->lockname.name);
+ } else if (!lock->convert_pending) {
+ mlog(0, "%s: res %.*s, owner died and lock has been moved back "
+ "to granted list, retry convert.\n",
+ dlm->name, res->lockname.len, res->lockname.name);
status = DLM_RECOVERING;
}
+
+ lock->convert_pending = 0;
bail:
spin_unlock(&res->spinlock);
}
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
- dlm->dlm_worker = create_singlethread_workqueue(wq_name);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
}
cleanup_inode = 1;
- user_dlm_worker = create_singlethread_workqueue("user_dlm");
+ user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
}
cleancache_init_shared_fs(sb);
- osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
if (!osb->ocfs2_wq) {
status = -ENOMEM;
mlog_errno(status);
return retval;
}
-static void account_pipe_buffers(struct pipe_inode_info *pipe,
+static unsigned long account_pipe_buffers(struct user_struct *user,
unsigned long old, unsigned long new)
{
- atomic_long_add(new - old, &pipe->user->pipe_bufs);
+ return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-static bool too_many_pipe_buffers_soft(struct user_struct *user)
+static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
- return pipe_user_pages_soft &&
- atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+ return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft;
}
-static bool too_many_pipe_buffers_hard(struct user_struct *user)
+static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
- return pipe_user_pages_hard &&
- atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+ return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard;
}
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
+ unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+ struct user_struct *user = get_current_user();
+ unsigned long user_bufs;
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
- if (pipe) {
- unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
- struct user_struct *user = get_current_user();
-
- if (!too_many_pipe_buffers_hard(user)) {
- if (too_many_pipe_buffers_soft(user))
- pipe_bufs = 1;
- pipe->bufs = kcalloc(pipe_bufs,
- sizeof(struct pipe_buffer),
- GFP_KERNEL_ACCOUNT);
- }
+ if (pipe == NULL)
+ goto out_free_uid;
- if (pipe->bufs) {
- init_waitqueue_head(&pipe->wait);
- pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = pipe_bufs;
- pipe->user = user;
- account_pipe_buffers(pipe, 0, pipe_bufs);
- mutex_init(&pipe->mutex);
- return pipe;
- }
- free_uid(user);
- kfree(pipe);
+ if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ pipe_bufs = pipe_max_size >> PAGE_SHIFT;
+
+ user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
+
+ if (too_many_pipe_buffers_soft(user_bufs)) {
+ user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
+ pipe_bufs = 1;
+ }
+
+ if (too_many_pipe_buffers_hard(user_bufs))
+ goto out_revert_acct;
+
+ pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ GFP_KERNEL_ACCOUNT);
+
+ if (pipe->bufs) {
+ init_waitqueue_head(&pipe->wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->buffers = pipe_bufs;
+ pipe->user = user;
+ mutex_init(&pipe->mutex);
+ return pipe;
}
+out_revert_acct:
+ (void) account_pipe_buffers(user, pipe_bufs, 0);
+ kfree(pipe);
+out_free_uid:
+ free_uid(user);
return NULL;
}
{
int i;
- account_pipe_buffers(pipe, pipe->buffers, 0);
+ (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
.fasync = pipe_fasync,
};
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+ unsigned long nr_pages;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
/*
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
+ unsigned int size, nr_pages;
+ unsigned long user_bufs;
+ long ret = 0;
+
+ size = round_pipe_size(arg);
+ nr_pages = size >> PAGE_SHIFT;
+
+ if (!nr_pages)
+ return -EINVAL;
+
+ /*
+ * If trying to increase the pipe capacity, check that an
+ * unprivileged user is not trying to exceed various limits
+ * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even
+ * if the user is currently over a limit.
+ */
+ if (nr_pages > pipe->buffers &&
+ size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+
+ if (nr_pages > pipe->buffers &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_revert_acct;
+ }
/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (nr_pages < pipe->nrbufs)
- return -EBUSY;
+ if (nr_pages < pipe->nrbufs) {
+ ret = -EBUSY;
+ goto out_revert_acct;
+ }
bufs = kcalloc(nr_pages, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs))
- return -ENOMEM;
+ if (unlikely(!bufs)) {
+ ret = -ENOMEM;
+ goto out_revert_acct;
+ }
/*
* The pipe array wraps around, so just start the new one at zero
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
- account_pipe_buffers(pipe, pipe->buffers, nr_pages);
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->buffers = nr_pages;
return nr_pages * PAGE_SIZE;
-}
-
-/*
- * Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
- */
-static inline unsigned int round_pipe_size(unsigned int size)
-{
- unsigned long nr_pages;
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+out_revert_acct:
+ (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+ return ret;
}
/*
__pipe_lock(pipe);
switch (cmd) {
- case F_SETPIPE_SZ: {
- unsigned int size, nr_pages;
-
- size = round_pipe_size(arg);
- nr_pages = size >> PAGE_SHIFT;
-
- ret = -EINVAL;
- if (!nr_pages)
- goto out;
-
- if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
- ret = -EPERM;
- goto out;
- } else if ((too_many_pipe_buffers_hard(pipe->user) ||
- too_many_pipe_buffers_soft(pipe->user)) &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- ret = -EPERM;
- goto out;
- }
- ret = pipe_set_size(pipe, nr_pages);
+ case F_SETPIPE_SZ:
+ ret = pipe_set_size(pipe, arg);
break;
- }
case F_GETPIPE_SZ:
ret = pipe->buffers * PAGE_SIZE;
break;
break;
}
-out:
__pipe_unlock(pipe);
return ret;
}
task_unlock(p);
rcu_read_unlock();
- seq_printf(m,
- "State:\t%s\n"
- "Tgid:\t%d\n"
- "Ngid:\t%d\n"
- "Pid:\t%d\n"
- "PPid:\t%d\n"
- "TracerPid:\t%d\n"
- "Uid:\t%d\t%d\t%d\t%d\n"
- "Gid:\t%d\t%d\t%d\t%d\n"
- "FDSize:\t%d\nGroups:\t",
- get_task_state(p),
- tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
- from_kuid_munged(user_ns, cred->uid),
- from_kuid_munged(user_ns, cred->euid),
- from_kuid_munged(user_ns, cred->suid),
- from_kuid_munged(user_ns, cred->fsuid),
- from_kgid_munged(user_ns, cred->gid),
- from_kgid_munged(user_ns, cred->egid),
- from_kgid_munged(user_ns, cred->sgid),
- from_kgid_munged(user_ns, cred->fsgid),
- max_fds);
-
+ seq_printf(m, "State:\t%s", get_task_state(p));
+
+ seq_put_decimal_ull(m, "\nTgid:\t", tgid);
+ seq_put_decimal_ull(m, "\nNgid:\t", ngid);
+ seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
+ seq_put_decimal_ull(m, "\nPPid:\t", ppid);
+ seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
+ seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
+ seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+ seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+
+ seq_puts(m, "\nGroups:\t");
group_info = cred->group_info;
for (g = 0; g < group_info->ngroups; g++)
- seq_printf(m, "%d ",
- from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+ seq_put_decimal_ull(m, g ? " " : "",
+ from_kgid_munged(user_ns, group_info->gid[g]));
put_cred(cred);
+ /* Trailing space shouldn't have been added in the first place. */
+ seq_putc(m, ' ');
#ifdef CONFIG_PID_NS
seq_puts(m, "\nNStgid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSpid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSpgid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNSsid:");
for (g = ns->level; g <= pid->level; g++)
- seq_printf(m, "\t%d",
- task_session_nr_ns(p, pid->numbers[g].ns));
+ seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
seq_putc(m, '\n');
}
unlock_task_sighand(p, &flags);
}
- seq_printf(m, "Threads:\t%d\n", num_threads);
- seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
+ seq_put_decimal_ull(m, "Threads:\t", num_threads);
+ seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
+ seq_put_decimal_ull(m, "/", qlim);
/* render them all */
- render_sigset_t(m, "SigPnd:\t", &pending);
+ render_sigset_t(m, "\nSigPnd:\t", &pending);
render_sigset_t(m, "ShdPnd:\t", &shpending);
render_sigset_t(m, "SigBlk:\t", &blocked);
render_sigset_t(m, "SigIgn:\t", &ignored);
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
- seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+ seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
+ seq_putc(m, '\n');
#endif
}
static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
{
- seq_printf(m, "voluntary_ctxt_switches:\t%lu\n"
- "nonvoluntary_ctxt_switches:\t%lu\n",
- p->nvcsw,
- p->nivcsw);
+ seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw);
+ seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw);
+ seq_putc(m, '\n');
}
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
start_time = nsec_to_clock_t(task->real_start_time);
seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
- seq_put_decimal_ll(m, ' ', ppid);
- seq_put_decimal_ll(m, ' ', pgid);
- seq_put_decimal_ll(m, ' ', sid);
- seq_put_decimal_ll(m, ' ', tty_nr);
- seq_put_decimal_ll(m, ' ', tty_pgrp);
- seq_put_decimal_ull(m, ' ', task->flags);
- seq_put_decimal_ull(m, ' ', min_flt);
- seq_put_decimal_ull(m, ' ', cmin_flt);
- seq_put_decimal_ull(m, ' ', maj_flt);
- seq_put_decimal_ull(m, ' ', cmaj_flt);
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
- seq_put_decimal_ll(m, ' ', priority);
- seq_put_decimal_ll(m, ' ', nice);
- seq_put_decimal_ll(m, ' ', num_threads);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', start_time);
- seq_put_decimal_ull(m, ' ', vsize);
- seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
- seq_put_decimal_ull(m, ' ', rsslim);
- seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
- seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
- seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
- seq_put_decimal_ull(m, ' ', esp);
- seq_put_decimal_ull(m, ' ', eip);
+ seq_put_decimal_ll(m, " ", ppid);
+ seq_put_decimal_ll(m, " ", pgid);
+ seq_put_decimal_ll(m, " ", sid);
+ seq_put_decimal_ll(m, " ", tty_nr);
+ seq_put_decimal_ll(m, " ", tty_pgrp);
+ seq_put_decimal_ull(m, " ", task->flags);
+ seq_put_decimal_ull(m, " ", min_flt);
+ seq_put_decimal_ull(m, " ", cmin_flt);
+ seq_put_decimal_ull(m, " ", maj_flt);
+ seq_put_decimal_ull(m, " ", cmaj_flt);
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+ seq_put_decimal_ll(m, " ", priority);
+ seq_put_decimal_ll(m, " ", nice);
+ seq_put_decimal_ll(m, " ", num_threads);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", start_time);
+ seq_put_decimal_ull(m, " ", vsize);
+ seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, " ", rsslim);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, " ", esp);
+ seq_put_decimal_ull(m, " ", eip);
/* The signal information here is obsolete.
* It must be decimal for Linux 2.0 compatibility.
* Use /proc/#/status for real-time signals.
*/
- seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);
/*
* We used to output the absolute kernel address, but that's an
else
seq_puts(m, " 0");
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ll(m, ' ', task->exit_signal);
- seq_put_decimal_ll(m, ' ', task_cpu(task));
- seq_put_decimal_ull(m, ' ', task->rt_priority);
- seq_put_decimal_ull(m, ' ', task->policy);
- seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
- seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
- seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ll(m, " ", task->exit_signal);
+ seq_put_decimal_ll(m, " ", task_cpu(task));
+ seq_put_decimal_ull(m, " ", task->rt_priority);
+ seq_put_decimal_ull(m, " ", task->policy);
+ seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
+ seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
if (mm && permitted) {
- seq_put_decimal_ull(m, ' ', mm->start_data);
- seq_put_decimal_ull(m, ' ', mm->end_data);
- seq_put_decimal_ull(m, ' ', mm->start_brk);
- seq_put_decimal_ull(m, ' ', mm->arg_start);
- seq_put_decimal_ull(m, ' ', mm->arg_end);
- seq_put_decimal_ull(m, ' ', mm->env_start);
- seq_put_decimal_ull(m, ' ', mm->env_end);
+ seq_put_decimal_ull(m, " ", mm->start_data);
+ seq_put_decimal_ull(m, " ", mm->end_data);
+ seq_put_decimal_ull(m, " ", mm->start_brk);
+ seq_put_decimal_ull(m, " ", mm->arg_start);
+ seq_put_decimal_ull(m, " ", mm->arg_end);
+ seq_put_decimal_ull(m, " ", mm->env_start);
+ seq_put_decimal_ull(m, " ", mm->env_end);
} else
- seq_printf(m, " 0 0 0 0 0 0 0");
+ seq_puts(m, " 0 0 0 0 0 0 0");
if (permitted)
- seq_put_decimal_ll(m, ' ', task->exit_code);
+ seq_put_decimal_ll(m, " ", task->exit_code);
else
- seq_put_decimal_ll(m, ' ', 0);
+ seq_puts(m, " 0");
seq_putc(m, '\n');
if (mm)
* seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
* size, resident, shared, text, data);
*/
- seq_put_decimal_ull(m, 0, size);
- seq_put_decimal_ull(m, ' ', resident);
- seq_put_decimal_ull(m, ' ', shared);
- seq_put_decimal_ull(m, ' ', text);
- seq_put_decimal_ull(m, ' ', 0);
- seq_put_decimal_ull(m, ' ', data);
- seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
seq_putc(m, '\n');
return 0;
if (!p)
return -ESRCH;
- if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
- task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
- task_unlock(p);
- } else
- count = -EPERM;
+ if (p != current) {
+ if (!capable(CAP_SYS_NICE)) {
+ count = -EPERM;
+ goto out;
+ }
+
+ err = security_task_setscheduler(p);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ }
+
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+out:
put_task_struct(p);
return count;
{
struct inode *inode = m->private;
struct task_struct *p;
- int err = 0;
+ int err = 0;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
- if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
- task_lock(p);
- seq_printf(m, "%llu\n", p->timer_slack_ns);
- task_unlock(p);
- } else
- err = -EPERM;
+ if (p != current) {
+ if (!capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out;
+ }
+ err = security_task_getscheduler(p);
+ if (err)
+ goto out;
+ }
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+
+out:
put_task_struct(p);
return err;
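
The two hunks above change the /proc/<pid>/timerslack_ns permission model:
instead of a ptrace check, acting on another task now requires CAP_SYS_NICE
plus an LSM scheduler-policy check. A small user-space sketch (assuming the
procfs file name; illustrative only) of reading and updating the value for
the current task:

#include <stdio.h>

int main(void)
{
	unsigned long long slack = 0;
	FILE *f;

	f = fopen("/proc/self/timerslack_ns", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%llu", &slack) == 1)
		printf("current timer slack: %llu ns\n", slack);
	fclose(f);

	f = fopen("/proc/self/timerslack_ns", "w");
	if (!f)
		return 1;
	/* Request 100us of slack; writing 0 restores the task default. */
	fprintf(f, "%llu\n", 100000ULL);
	fclose(f);
	return 0;
}
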
put_task_struct(task);
if (files) {
- int fd = proc_fd(m->private);
+ unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
struct task_struct *task;
const struct cred *cred;
struct inode *inode;
- int fd;
+ unsigned int fd;
if (flags & LOOKUP_RCU)
return -ECHILD;
}
if (files) {
- int fd = proc_fd(d_inode(dentry));
+ unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
spin_lock(&files->file_lock);
continue;
rcu_read_unlock();
- len = snprintf(name, sizeof(name), "%d", fd);
+ len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
extern int proc_fd_permission(struct inode *inode, int mask);
-static inline int proc_fd(struct inode *inode)
+static inline unsigned int proc_fd(struct inode *inode)
{
return PROC_I(inode)->fd;
}
struct proc_inode {
struct pid *pid;
- int fd;
+ unsigned int fd;
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
{
}
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+ char v[32];
+ static const char blanks[7] = {' ', ' ', ' ', ' ', ' ', ' ', ' '};
+ int len;
+
+ len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
+
+ seq_write(m, s, 16);
+
+ if (len > 0) {
+ if (len < 8)
+ seq_write(m, blanks, 8 - len);
+
+ seq_write(m, v, len);
+ }
+ seq_write(m, " kB\n", 4);
+}
+
static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long pages[NR_LRU_LISTS];
int lru;
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
available = si_mem_available();
- /*
- * Tagged format, for easy grepping and expansion.
- */
- seq_printf(m,
- "MemTotal: %8lu kB\n"
- "MemFree: %8lu kB\n"
- "MemAvailable: %8lu kB\n"
- "Buffers: %8lu kB\n"
- "Cached: %8lu kB\n"
- "SwapCached: %8lu kB\n"
- "Active: %8lu kB\n"
- "Inactive: %8lu kB\n"
- "Active(anon): %8lu kB\n"
- "Inactive(anon): %8lu kB\n"
- "Active(file): %8lu kB\n"
- "Inactive(file): %8lu kB\n"
- "Unevictable: %8lu kB\n"
- "Mlocked: %8lu kB\n"
-#ifdef CONFIG_HIGHMEM
- "HighTotal: %8lu kB\n"
- "HighFree: %8lu kB\n"
- "LowTotal: %8lu kB\n"
- "LowFree: %8lu kB\n"
-#endif
-#ifndef CONFIG_MMU
- "MmapCopy: %8lu kB\n"
-#endif
- "SwapTotal: %8lu kB\n"
- "SwapFree: %8lu kB\n"
- "Dirty: %8lu kB\n"
- "Writeback: %8lu kB\n"
- "AnonPages: %8lu kB\n"
- "Mapped: %8lu kB\n"
- "Shmem: %8lu kB\n"
- "Slab: %8lu kB\n"
- "SReclaimable: %8lu kB\n"
- "SUnreclaim: %8lu kB\n"
- "KernelStack: %8lu kB\n"
- "PageTables: %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
- "Quicklists: %8lu kB\n"
-#endif
- "NFS_Unstable: %8lu kB\n"
- "Bounce: %8lu kB\n"
- "WritebackTmp: %8lu kB\n"
- "CommitLimit: %8lu kB\n"
- "Committed_AS: %8lu kB\n"
- "VmallocTotal: %8lu kB\n"
- "VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n"
-#ifdef CONFIG_MEMORY_FAILURE
- "HardwareCorrupted: %5lu kB\n"
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "AnonHugePages: %8lu kB\n"
- "ShmemHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
-#endif
-#ifdef CONFIG_CMA
- "CmaTotal: %8lu kB\n"
- "CmaFree: %8lu kB\n"
-#endif
- ,
- K(i.totalram),
- K(i.freeram),
- K(available),
- K(i.bufferram),
- K(cached),
- K(total_swapcache_pages()),
- K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
- K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
- K(pages[LRU_ACTIVE_ANON]),
- K(pages[LRU_INACTIVE_ANON]),
- K(pages[LRU_ACTIVE_FILE]),
- K(pages[LRU_INACTIVE_FILE]),
- K(pages[LRU_UNEVICTABLE]),
- K(global_page_state(NR_MLOCK)),
+ show_val_kb(m, "MemTotal: ", i.totalram);
+ show_val_kb(m, "MemFree: ", i.freeram);
+ show_val_kb(m, "MemAvailable: ", available);
+ show_val_kb(m, "Buffers: ", i.bufferram);
+ show_val_kb(m, "Cached: ", cached);
+ show_val_kb(m, "SwapCached: ", total_swapcache_pages());
+ show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] +
+ pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] +
+ pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]);
+ show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
+ show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
+ show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+
#ifdef CONFIG_HIGHMEM
- K(i.totalhigh),
- K(i.freehigh),
- K(i.totalram-i.totalhigh),
- K(i.freeram-i.freehigh),
+ show_val_kb(m, "HighTotal: ", i.totalhigh);
+ show_val_kb(m, "HighFree: ", i.freehigh);
+ show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh);
+ show_val_kb(m, "LowFree: ", i.freeram - i.freehigh);
#endif
+
#ifndef CONFIG_MMU
- K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
+ show_val_kb(m, "MmapCopy: ",
+ (unsigned long)atomic_long_read(&mmap_pages_allocated));
#endif
- K(i.totalswap),
- K(i.freeswap),
- K(global_node_page_state(NR_FILE_DIRTY)),
- K(global_node_page_state(NR_WRITEBACK)),
- K(global_node_page_state(NR_ANON_MAPPED)),
- K(global_node_page_state(NR_FILE_MAPPED)),
- K(i.sharedram),
- K(global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE)),
- K(global_page_state(NR_SLAB_RECLAIMABLE)),
- K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
- global_page_state(NR_KERNEL_STACK_KB),
- K(global_page_state(NR_PAGETABLE)),
+
+ show_val_kb(m, "SwapTotal: ", i.totalswap);
+ show_val_kb(m, "SwapFree: ", i.freeswap);
+ show_val_kb(m, "Dirty: ",
+ global_node_page_state(NR_FILE_DIRTY));
+ show_val_kb(m, "Writeback: ",
+ global_node_page_state(NR_WRITEBACK));
+ show_val_kb(m, "AnonPages: ",
+ global_node_page_state(NR_ANON_MAPPED));
+ show_val_kb(m, "Mapped: ",
+ global_node_page_state(NR_FILE_MAPPED));
+ show_val_kb(m, "Shmem: ", i.sharedram);
+ show_val_kb(m, "Slab: ",
+ global_page_state(NR_SLAB_RECLAIMABLE) +
+ global_page_state(NR_SLAB_UNRECLAIMABLE));
+
+ show_val_kb(m, "SReclaimable: ",
+ global_page_state(NR_SLAB_RECLAIMABLE));
+ show_val_kb(m, "SUnreclaim: ",
+ global_page_state(NR_SLAB_UNRECLAIMABLE));
+ seq_printf(m, "KernelStack: %8lu kB\n",
+ global_page_state(NR_KERNEL_STACK_KB));
+ show_val_kb(m, "PageTables: ",
+ global_page_state(NR_PAGETABLE));
#ifdef CONFIG_QUICKLIST
- K(quicklist_total_size()),
+ show_val_kb(m, "Quicklists: ", quicklist_total_size());
#endif
- K(global_node_page_state(NR_UNSTABLE_NFS)),
- K(global_page_state(NR_BOUNCE)),
- K(global_node_page_state(NR_WRITEBACK_TEMP)),
- K(vm_commit_limit()),
- K(committed),
- (unsigned long)VMALLOC_TOTAL >> 10,
- 0ul, // used to be vmalloc 'used'
- 0ul // used to be vmalloc 'largest_chunk'
+
+ show_val_kb(m, "NFS_Unstable: ",
+ global_node_page_state(NR_UNSTABLE_NFS));
+ show_val_kb(m, "Bounce: ",
+ global_page_state(NR_BOUNCE));
+ show_val_kb(m, "WritebackTmp: ",
+ global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "CommitLimit: ", vm_commit_limit());
+ show_val_kb(m, "Committed_AS: ", committed);
+ seq_printf(m, "VmallocTotal: %8lu kB\n",
+ (unsigned long)VMALLOC_TOTAL >> 10);
+ show_val_kb(m, "VmallocUsed: ", 0ul);
+ show_val_kb(m, "VmallocChunk: ", 0ul);
+
#ifdef CONFIG_MEMORY_FAILURE
- , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+ seq_printf(m, "HardwareCorrupted: %5lu kB\n",
+ atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
- , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
- , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+ show_val_kb(m, "AnonHugePages: ",
+ global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "ShmemHugePages: ",
+ global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "ShmemPmdMapped: ",
+ global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
#endif
+
#ifdef CONFIG_CMA
- , K(totalcma_pages)
- , K(global_page_state(NR_FREE_CMA_PAGES))
+ show_val_kb(m, "CmaTotal: ", totalcma_pages);
+ show_val_kb(m, "CmaFree: ",
+ global_page_state(NR_FREE_CMA_PAGES));
#endif
- );
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
return 0;
-#undef K
}
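
Each show_val_kb() call writes a label padded to exactly 16 bytes, the value
right-aligned in an eight-character field, and a " kB" suffix, so the
/proc/meminfo layout is unchanged by the conversion away from the single
seq_printf(). A user-space sketch (illustrative only) that parses one of
these lines:

#include <stdio.h>
#include <string.h>

/* Return the value of a /proc/meminfo field in kB, or -1 if absent. */
static long long meminfo_kb(const char *label)
{
	char line[256];
	long long val = -1;
	size_t len = strlen(label);
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, label, len) == 0 && line[len] == ':') {
			sscanf(line + len + 1, "%lld", &val);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(void)
{
	printf("MemAvailable: %lld kB\n", meminfo_kb("MemAvailable"));
	return 0;
}
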
static int meminfo_proc_open(struct inode *inode, struct file *file)
}
sum += arch_irq_stat();
- seq_puts(p, "cpu ");
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
seq_putc(p, '\n');
for_each_online_cpu(i) {
guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
- seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
seq_putc(p, '\n');
}
- seq_printf(p, "intr %llu", (unsigned long long)sum);
+ seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
seq_printf(p,
"\nctxt %llu\n"
nr_running(),
nr_iowait());
- seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+ seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
for (i = 0; i < NR_SOFTIRQS; i++)
- seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+ seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
seq_putc(p, '\n');
return 0;
}
mmu_notifier_invalidate_range_start(mm, 0, -1);
}
- walk_page_range(0, ~0UL, &clear_refs_walk);
+ walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
/*
* A helper routine for putting decimal numbers without rich format of printf().
* only 'unsigned long long' is supported.
- * This routine will put one byte delimiter + number into seq_file.
+ * This routine will put the delimiter followed by the number into seq_file.
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
unsigned long long num)
{
int len;
if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
- if (delimiter)
- m->buf[m->count++] = delimiter;
+ len = strlen(delimiter);
+ if (m->count + len >= m->size)
+ goto overflow;
+
+ memcpy(m->buf + m->count, delimiter, len);
+ m->count += len;
+
+ if (m->count + 1 >= m->size)
+ goto overflow;
if (num < 10) {
m->buf[m->count++] = num + '0';
len = num_to_str(m->buf + m->count, m->size - m->count, num);
if (!len)
goto overflow;
+
m->count += len;
return;
}
EXPORT_SYMBOL(seq_put_decimal_ull);
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
+ int len;
+
+ if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
+ goto overflow;
+
+ len = strlen(delimiter);
+ if (m->count + len >= m->size)
+ goto overflow;
+
+ memcpy(m->buf + m->count, delimiter, len);
+ m->count += len;
+
+ if (m->count + 2 >= m->size)
+ goto overflow;
+
if (num < 0) {
- if (m->count + 3 >= m->size) {
- seq_set_overflow(m);
- return;
- }
- if (delimiter)
- m->buf[m->count++] = delimiter;
+ m->buf[m->count++] = '-';
num = -num;
- delimiter = '-';
}
- seq_put_decimal_ull(m, delimiter, num);
+
+ if (num < 10) {
+ m->buf[m->count++] = num + '0';
+ return;
+ }
+
+ len = num_to_str(m->buf + m->count, m->size - m->count, num);
+ if (!len)
+ goto overflow;
+
+ m->count += len;
+ return;
+
+overflow:
+ seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_put_decimal_ll);
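
With the delimiter parameter now a string, a label or newline can be folded
into the same call instead of pairing seq_puts() with a one-character
delimiter, which is what the /proc/*/status and /proc/stat conversions above
rely on. A sketch of a seq_file show callback using the reworked helpers
(the structure and field names are illustrative only):

#include <linux/seq_file.h>

struct example_stats {		/* illustrative only */
	unsigned long long reads;
	unsigned long long writes;
	long long balance;
};

static int example_show(struct seq_file *m, void *v)
{
	struct example_stats *st = m->private;

	seq_put_decimal_ull(m, "reads:\t", st->reads);
	seq_put_decimal_ull(m, "\nwrites:\t", st->writes);
	seq_put_decimal_ll(m, "\nbalance:\t", st->balance);
	seq_putc(m, '\n');
	return 0;
}
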
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
};
*(.spinlock.text) \
VMLINUX_SYMBOL(__lock_text_end) = .;
+#define CPUIDLE_TEXT \
+ ALIGN_FUNCTION(); \
+ VMLINUX_SYMBOL(__cpuidle_text_start) = .; \
+ *(.cpuidle.text) \
+ VMLINUX_SYMBOL(__cpuidle_text_end) = .;
+
#define KPROBES_TEXT \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__kprobes_text_start) = .; \
#ifndef _LINUX_AUTO_DEV_IOCTL_H
#define _LINUX_AUTO_DEV_IOCTL_H
-#include <linux/auto_fs.h>
-#include <linux/string.h>
-
-#define AUTOFS_DEVICE_NAME "autofs"
-
-#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
-#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
-
-#define AUTOFS_DEVID_LEN 16
-
-#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
-
-/*
- * An ioctl interface for autofs mount point control.
- */
-
-struct args_protover {
- __u32 version;
-};
-
-struct args_protosubver {
- __u32 sub_version;
-};
-
-struct args_openmount {
- __u32 devid;
-};
-
-struct args_ready {
- __u32 token;
-};
-
-struct args_fail {
- __u32 token;
- __s32 status;
-};
-
-struct args_setpipefd {
- __s32 pipefd;
-};
-
-struct args_timeout {
- __u64 timeout;
-};
-
-struct args_requester {
- __u32 uid;
- __u32 gid;
-};
-
-struct args_expire {
- __u32 how;
-};
-
-struct args_askumount {
- __u32 may_umount;
-};
-
-struct args_ismountpoint {
- union {
- struct args_in {
- __u32 type;
- } in;
- struct args_out {
- __u32 devid;
- __u32 magic;
- } out;
- };
-};
-
-/*
- * All the ioctls use this structure.
- * When sending a path size must account for the total length
- * of the chunk of memory otherwise is is the size of the
- * structure.
- */
-
-struct autofs_dev_ioctl {
- __u32 ver_major;
- __u32 ver_minor;
- __u32 size; /* total size of data passed in
- * including this struct */
- __s32 ioctlfd; /* automount command fd */
-
- /* Command parameters */
-
- union {
- struct args_protover protover;
- struct args_protosubver protosubver;
- struct args_openmount openmount;
- struct args_ready ready;
- struct args_fail fail;
- struct args_setpipefd setpipefd;
- struct args_timeout timeout;
- struct args_requester requester;
- struct args_expire expire;
- struct args_askumount askumount;
- struct args_ismountpoint ismountpoint;
- };
-
- char path[0];
-};
-
-static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
-{
- memset(in, 0, sizeof(struct autofs_dev_ioctl));
- in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
- in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
- in->size = sizeof(struct autofs_dev_ioctl);
- in->ioctlfd = -1;
-}
-
-/*
- * If you change this make sure you make the corresponding change
- * to autofs-dev-ioctl.c:lookup_ioctl()
- */
-enum {
- /* Get various version info */
- AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
- AUTOFS_DEV_IOCTL_PROTOVER_CMD,
- AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD,
-
- /* Open mount ioctl fd */
- AUTOFS_DEV_IOCTL_OPENMOUNT_CMD,
-
- /* Close mount ioctl fd */
- AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD,
-
- /* Mount/expire status returns */
- AUTOFS_DEV_IOCTL_READY_CMD,
- AUTOFS_DEV_IOCTL_FAIL_CMD,
-
- /* Activate/deactivate autofs mount */
- AUTOFS_DEV_IOCTL_SETPIPEFD_CMD,
- AUTOFS_DEV_IOCTL_CATATONIC_CMD,
-
- /* Expiry timeout */
- AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
-
- /* Get mount last requesting uid and gid */
- AUTOFS_DEV_IOCTL_REQUESTER_CMD,
-
- /* Check for eligible expire candidates */
- AUTOFS_DEV_IOCTL_EXPIRE_CMD,
-
- /* Request busy status */
- AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD,
-
- /* Check if path is a mountpoint */
- AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
-};
-
-#define AUTOFS_IOCTL 0x93
-
-#define AUTOFS_DEV_IOCTL_VERSION \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_PROTOVER \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_PROTOSUBVER \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_OPENMOUNT \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_READY \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_FAIL \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_SETPIPEFD \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_CATATONIC \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_TIMEOUT \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_REQUESTER \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_EXPIRE \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_ASKUMOUNT \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \
- _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl)
-
+#include <uapi/linux/auto_dev-ioctl.h>
#endif /* _LINUX_AUTO_DEV_IOCTL_H */
#define _LINUX_AUTO_FS_H
#include <linux/fs.h>
-#include <linux/limits.h>
#include <linux/ioctl.h>
#include <uapi/linux/auto_fs.h>
#endif /* _LINUX_AUTO_FS_H */
return order; /* We could be slightly more clever with -1 here... */
}
-static inline int get_count_order(unsigned int count)
-{
- int order;
-
- order = fls(count) - 1;
- if (count & (count - 1))
- order++;
- return order;
-}
-
static __always_inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
return fls64(l);
}
+static inline int get_count_order(unsigned int count)
+{
+ int order;
+
+ order = fls(count) - 1;
+ if (count & (count - 1))
+ order++;
+ return order;
+}
+
+/**
+ * get_count_order_long - get order after rounding @l up to power of 2
+ * @l: parameter
+ *
+ * It is the same as get_count_order() but with a long type parameter.
+ */
+static inline int get_count_order_long(unsigned long l)
+{
+ if (l == 0UL)
+ return -1;
+ else if (l & (l - 1UL))
+ return (int)fls_long(l);
+ else
+ return (int)fls_long(l) - 1;
+}
+
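
A user-space approximation (assuming a 64-bit long and GCC builtins in place
of the kernel's fls_long()) that checks a few values of the helper by hand:

#include <assert.h>

static int count_order_long(unsigned long l)
{
	int fls = l ? (int)(8 * sizeof(l)) - __builtin_clzl(l) : 0;

	if (l == 0UL)
		return -1;
	else if (l & (l - 1UL))
		return fls;		/* not a power of two: round up */
	else
		return fls - 1;		/* exact power of two */
}

int main(void)
{
	assert(count_order_long(0) == -1);
	assert(count_order_long(1) == 0);
	assert(count_order_long(4) == 2);
	assert(count_order_long(5) == 3);	/* rounded up to 8 */
	assert(count_order_long(64) == 6);
	return 0;
}
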
/**
* __ffs64 - find first set bit in a 64 bit word
* @word: The 64 bit word
#include <linux/mmzone.h>
#include <linux/mm_types.h>
#include <asm/dma.h>
+#include <asm/processor.h>
/*
* simple boot-time physical memory area allocator.
#define BOOTMEM_LOW_LIMIT __pa(MAX_DMA_ADDRESS)
#endif
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
#define alloc_bootmem(x) \
__alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_align(x, align) \
NUMA_NO_NODE);
}
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
-
static inline void * __init memblock_virt_alloc_low(
phys_addr_t size, phys_addr_t align)
{
* Lower value means higher priority, analogically to reclaim priority.
*/
enum compact_priority {
+ COMPACT_PRIO_SYNC_FULL,
+ MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
COMPACT_PRIO_SYNC_LIGHT,
- MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+ MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
COMPACT_PRIO_ASYNC,
INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
COMPACT_CONTENDED,
/*
- * direct compaction partially compacted a zone and there might be
- * suitable pages
+ * direct compaction terminated after concluding that the allocation
+ * should now succeed
*/
- COMPACT_PARTIAL,
+ COMPACT_SUCCESS,
};
struct alloc_context; /* in mm/internal.h */
+/*
+ * Number of free order-0 pages that should be available above given watermark
+ * to make sure compaction has reasonable chance of not running out of free
+ * pages that it needs to isolate as migration target during its work.
+ */
+static inline unsigned long compact_gap(unsigned int order)
+{
+ /*
+ * Although all the isolations for migration are temporary, compaction
+ * free scanner may have up to 1 << order pages on its list and then
+ * try to split an (order - 1) free page. At that point, a gap of
+ * 1 << order might not be enough, so it's safer to require twice that
+ * amount. Note that the number of pages on the list is also
+ * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
+ * that the migrate scanner can have isolated on migrate list, and free
+ * scanner is only invoked when the number of isolated free pages is
+ * lower than that. But it's not worth complicating the formula here
+ * as a bigger gap for higher orders than strictly necessary can also
+ * improve chances of compaction success.
+ */
+ return 2UL << order;
+}
+
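
For example, an order-3 request asks for 2UL << 3 = 16 extra order-0 pages
above the watermark. A sketch of how a caller might fold the gap into a
watermark test (illustrative only; the helper name and the exact call are
assumptions, not the call sites changed by this series):

static bool example_compaction_has_room(struct zone *zone, unsigned int order,
					int classzone_idx)
{
	unsigned long watermark = low_wmark_pages(zone) + compact_gap(order);

	/* Order-0 check: compaction only needs base pages to migrate to. */
	return __zone_watermark_ok(zone, 0, watermark, classzone_idx, 0,
				   zone_page_state(zone, NR_FREE_PAGES));
}
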
#ifdef CONFIG_COMPACTION
extern int sysctl_compact_memory;
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, unsigned int alloc_flags,
const struct alloc_context *ac, enum compact_priority prio);
-extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags, int classzone_idx);
* that the compaction successfully isolated and migrated some
* pageblocks.
*/
- if (result == COMPACT_PARTIAL)
+ if (result == COMPACT_SUCCESS)
return true;
return false;
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
#else
-static inline void compact_pgdat(pg_data_t *pgdat, int order)
-{
-}
-
static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
-extern __printf(1, 2) int compat_printk(const char *fmt, ...);
extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
#endif
extern bool console_suspend_enabled;
+#ifdef CONFIG_OF
+extern void console_set_by_of(void);
+#else
+static inline void console_set_by_of(void) {}
+#endif
+
/* Suspend and resume console messages over PM events */
extern void suspend_console(void);
extern void resume_console(void);
void cpu_idle_poll_ctrl(bool enable);
+/* Attach to any functions which should be considered cpuidle. */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
+bool cpu_in_idle(unsigned long pc);
+
void arch_cpu_idle(void);
void arch_cpu_idle_prepare(void);
void arch_cpu_idle_enter(void);
--- /dev/null
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
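
A sketch of how a caller would use the interface declared above (the include
path is an assumption about where the new header lands under include/linux/):

#include <linux/crc64_ecma.h>

/* Checksum a buffer starting from the default ECMA seed. */
static u64 example_crc64(const u8 *buf, u32 len)
{
	return crc64_ecma(buf, len, crc64_ecma_seed());
}
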
/*
* COW Supplementary groups list
*/
-#define NGROUPS_SMALL 32
-#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(kgid_t)))
-
struct group_info {
atomic_t usage;
int ngroups;
- int nblocks;
- kgid_t small_block[NGROUPS_SMALL];
- kgid_t *blocks[0];
+ kgid_t gid[0];
};
/**
extern int groups_search(const struct group_info *, kgid_t);
extern bool may_setgroups(void);
-/* access the groups "array" with this macro */
-#define GROUP_AT(gi, i) \
- ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
-
/*
* The security context of a task
*
#define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0)
#define isalpha(c) ((__ismask(c)&(_U|_L)) != 0)
#define iscntrl(c) ((__ismask(c)&(_C)) != 0)
-#define isdigit(c) ((__ismask(c)&(_D)) != 0)
+static inline int isdigit(int c)
+{
+ return '0' <= c && c <= '9';
+}
#define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0)
#define islower(c) ((__ismask(c)&(_L)) != 0)
#define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0)
* that gives better TLB efficiency.
*/
#define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7)
+/*
+ * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress
+ * allocation failure reports (similarly to __GFP_NOWARN).
+ */
+#define DMA_ATTR_NO_WARN (1UL << 8)
/*
* A dma_addr_t can hold any valid DMA or bus address for the platform.
extern unsigned long transparent_hugepage_flags;
+extern unsigned long thp_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags);
+
extern void prep_transhuge_page(struct page *page);
extern void free_transhuge_page(struct page *page);
return is_huge_zero_page(pmd_page(pmd));
}
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
static inline void prep_transhuge_page(struct page *page) {}
#define transparent_hugepage_flags 0UL
+
+#define thp_get_unmapped_area NULL
+
static inline int
split_huge_page_to_list(struct page *page, struct list_head *list)
{
return false;
}
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
{
- BUILD_BUG();
+ return;
}
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
#define __LINUX_KBUILD_H
#define DEFINE(sym, val) \
- asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+ asm volatile ("#define " #sym " %0 /*" #val :: "i" (val))
#define BLANK() asm volatile("\n->" : : )
* strict type-checking.. See the
* "unnecessary" pointer comparison.
*/
-#define min(x, y) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- (void) (&_min1 == &_min2); \
- _min1 < _min2 ? _min1 : _min2; })
-
-#define max(x, y) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- (void) (&_max1 == &_max2); \
- _max1 > _max2 ? _max1 : _max2; })
+#define __min(t1, t2, min1, min2, x, y) ({ \
+ t1 min1 = (x); \
+ t2 min2 = (y); \
+ (void) (&min1 == &min2); \
+ min1 < min2 ? min1 : min2; })
+#define min(x, y) \
+ __min(typeof(x), typeof(y), \
+ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \
+ x, y)
+
+#define __max(t1, t2, max1, max2, x, y) ({ \
+ t1 max1 = (x); \
+ t2 max2 = (y); \
+ (void) (&max1 == &max2); \
+ max1 > max2 ? max1 : max2; })
+#define max(x, y) \
+ __max(typeof(x), typeof(y), \
+ __UNIQUE_ID(max1_), __UNIQUE_ID(max2_), \
+ x, y)
#define min3(x, y, z) min((typeof(x))min(x, y), z)
#define max3(x, y, z) max((typeof(x))max(x, y), z)
*
* Or not use min/max/clamp at all, of course.
*/
-#define min_t(type, x, y) ({ \
- type __min1 = (x); \
- type __min2 = (y); \
- __min1 < __min2 ? __min1: __min2; })
-
-#define max_t(type, x, y) ({ \
- type __max1 = (x); \
- type __max2 = (y); \
- __max1 > __max2 ? __max1: __max2; })
+#define min_t(type, x, y) \
+ __min(type, type, \
+ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \
+ x, y)
+
+#define max_t(type, x, y) \
+ __max(type, type, \
+ __UNIQUE_ID(min1_), __UNIQUE_ID(min2_), \
+ x, y)
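
The point of __UNIQUE_ID() is that nested min()/max() expansions no longer
declare two locals with the same name inside one another, which is what used
to trigger shadow warnings. A user-space sketch of the old-style macro and
the kind of nested expression that warned under -Wshadow (illustrative only):

#include <stdio.h>

/* Old-style macro: every expansion reuses the names _min1/_min2. */
#define old_min(x, y) ({ \
	__typeof__(x) _min1 = (x); \
	__typeof__(y) _min2 = (y); \
	(void) (&_min1 == &_min2); \
	_min1 < _min2 ? _min1 : _min2; })

int main(void)
{
	int a = 3, b = 7, c = 5;

	/* The inner expansion shadows the outer one's _min1/_min2. */
	printf("%d\n", old_min(a, old_min(b, c)));
	return 0;
}
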
/**
* clamp_t - return a value clamped to a given range using a given type
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t memblock_phys_mem_size(void);
+phys_addr_t memblock_reserved_size(void);
phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+int mem_cgroup_scan_tasks(struct mem_cgroup *,
+ int (*)(struct task_struct *, void *), void *);
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
void mem_cgroup_handle_over_high(void);
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg);
+
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
{
}
+static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
+{
+ return 0;
+}
+
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
return 0;
return 0;
}
+static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
return page->mapping;
}
-/*
- * Return the pagecache index of the passed page. Regular pagecache pages
- * use ->index whereas swapcache pages use ->private
- */
-static inline pgoff_t page_index(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return page_private(page);
- return page->index;
-}
-
extern pgoff_t __page_file_index(struct page *page);
/*
- * Return the file index of the page. Regular pagecache pages use ->index
- * whereas swapcache pages use swp_offset(->private)
+ * Return the pagecache index of the passed page. Regular pagecache pages
+ * use ->index whereas swapcache pages use swp_offset(->private)
*/
-static inline pgoff_t page_file_index(struct page *page)
+static inline pgoff_t page_index(struct page *page)
{
if (unlikely(PageSwapCache(page)))
return __page_file_index(page);
-
return page->index;
}
* @pte_hole: if set, called for each hole at all levels
* @hugetlb_entry: if set, called for each hugetlb entry
* @test_walk: caller specific callback function to determine whether
- * we walk over the current vma or not. A positive returned
+ * we walk over the current vma or not. A zero return
* value means "do page table walk over the current vma,"
* and a negative one means "abort current page table walk
- * right now." 0 means "skip the current vma."
+ * right now." 1 means "skip the current vma."
* @mm: mm_struct representing the target process of page table walk
* @vma: vma currently walked (NULL if walking outside vmas)
* @private: private data for callbacks' usage
struct mminit_pfnnid_cache *state);
#endif
-extern void set_dma_reserve(unsigned long new_dma_reserve);
+extern void set_memory_reserve(unsigned long nr_reserve, bool inc);
extern void memmap_init_zone(unsigned long, int, unsigned long,
unsigned long, enum memmap_context);
extern void setup_per_zone_wmarks(void);
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
+#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+extern unsigned long arch_reserved_kernel_pages(void);
+#endif
extern __printf(3, 4)
void warn_alloc_failed(gfp_t gfp_mask, unsigned int order,
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
-#ifdef CONFIG_MMU
struct work_struct async_put_work;
-#endif
};
static inline void mm_init_cpumask(struct mm_struct *mm)
* base function. Return whether such support was available,
* to allow calling code to fall back to some other mechanism:
*/
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
static inline bool trigger_all_cpu_backtrace(void)
{
- arch_trigger_all_cpu_backtrace(true);
-
+ arch_trigger_cpumask_backtrace(cpu_online_mask, false);
return true;
}
+
static inline bool trigger_allbutself_cpu_backtrace(void)
{
- arch_trigger_all_cpu_backtrace(false);
+ arch_trigger_cpumask_backtrace(cpu_online_mask, true);
+ return true;
+}
+
+static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
+{
+ arch_trigger_cpumask_backtrace(mask, false);
+ return true;
+}
+
+static inline bool trigger_single_cpu_backtrace(int cpu)
+{
+ arch_trigger_cpumask_backtrace(cpumask_of(cpu), false);
return true;
}
/* generic implementation */
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+ bool exclude_self,
void (*raise)(cpumask_t *mask));
bool nmi_cpu_backtrace(struct pt_regs *regs);
{
return false;
}
+static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
+{
+ return false;
+}
+static inline bool trigger_single_cpu_backtrace(int cpu)
+{
+ return false;
+}
#endif
#ifdef CONFIG_LOCKUP_DETECTOR
* for display purposes.
*/
const int order;
-};
-
-/*
- * Types of limitations to the nodes from which allocations may occur
- */
-enum oom_constraint {
- CONSTRAINT_NONE,
- CONSTRAINT_CPUSET,
- CONSTRAINT_MEMORY_POLICY,
- CONSTRAINT_MEMCG,
-};
-enum oom_scan_t {
- OOM_SCAN_OK, /* scan thread and find its badness */
- OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */
- OOM_SCAN_ABORT, /* abort the iteration and return */
- OOM_SCAN_SELECT, /* always select this thread first */
+ /* Used by oom implementation, do not set */
+ unsigned long totalpages;
+ struct task_struct *chosen;
+ unsigned long chosen_points;
};
extern struct mutex oom_lock;
return p->signal->oom_flag_origin;
}
-extern void mark_oom_victim(struct task_struct *tsk);
-
-#ifdef CONFIG_MMU
-extern void wake_oom_reaper(struct task_struct *tsk);
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline bool tsk_is_oom_victim(struct task_struct * tsk)
{
+ return tsk->signal->oom_mm;
}
-#endif
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
unsigned long totalpages);
-extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
- unsigned int points, unsigned long totalpages,
- const char *message);
-
-extern void check_panic_on_oom(struct oom_control *oc,
- enum oom_constraint constraint);
-
-extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task);
-
extern bool out_of_memory(struct oom_control *oc);
-extern void exit_oom_victim(struct task_struct *tsk);
+extern void exit_oom_victim(void);
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
-extern bool oom_killer_disabled;
-extern bool oom_killer_disable(void);
+extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
-bool task_will_free_mem(struct task_struct *task);
-
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
struct pglist_data;
struct page_ext_operations {
+ size_t offset;
+ size_t size;
bool (*need)(void);
void (*init)(void);
};
*/
struct page_ext {
unsigned long flags;
-#ifdef CONFIG_PAGE_OWNER
- unsigned int order;
- gfp_t gfp_mask;
- int last_migrate_reason;
- depot_stack_handle_t handle;
-#endif
};
extern void pgdat_page_ext_init(struct pglist_data *pgdat);
extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
extern void __set_page_owner_migrate_reason(struct page *page, int reason);
extern void __dump_page_owner(struct page *page);
+extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone);
static inline void reset_page_owner(struct page *page, unsigned int order)
{
AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */
+ /* writeback related tags are not used */
+ AS_NO_WRITEBACK_TAGS = __GFP_BITS_SHIFT + 5,
+
+ AS_LAST_FLAG,
};
static inline void mapping_set_error(struct address_space *mapping, int error)
return test_bit(AS_EXITING, &mapping->flags);
}
+static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
+{
+ set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
+}
+
+static inline int mapping_use_writeback_tags(struct address_space *mapping)
+{
+ return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
static inline loff_t page_file_offset(struct page *page)
{
- return ((loff_t)page_file_index(page)) << PAGE_SHIFT;
+ return ((loff_t)page_index(page)) << PAGE_SHIFT;
}
extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
*
* This function updates @iter->index in the case of a successful lookup.
* For tagged lookup it also eats @iter->tags.
+ *
+ * There are several cases where 'slot' can be passed in as NULL to this
+ * function. These cases result from the use of radix_tree_iter_next() or
+ * radix_tree_iter_retry(). In these cases we don't end up dereferencing
+ * 'slot' because either:
+ * a) we are doing tagged iteration and iter->tags has been set to 0, or
+ * b) we are doing non-tagged iteration, and iter->index and iter->next_index
+ * have been set up so that radix_tree_chunk_size() returns 1 or 0.
*/
static __always_inline void **
radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
unsigned int get_random_int(void);
unsigned long get_random_long(void);
-unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
+unsigned long randomize_page(unsigned long start, unsigned long range);
u32 prandom_u32(void);
void prandom_bytes(void *buf, size_t nbytes);
#include <linux/poll.h>
#include <linux/kref.h>
#include <linux/percpu.h>
+#include <linux/irq_work.h>
/*
* Tracks changes to rchan/rchan_buf structs
size_t subbufs_consumed; /* count of sub-buffers consumed */
struct rchan *chan; /* associated channel */
wait_queue_head_t read_wait; /* reader wait queue */
- struct timer_list timer; /* reader wake-up timer */
+ struct irq_work wakeup_work; /* reader wakeup */
struct dentry *dentry; /* channel file dentry */
struct kref kref; /* channel buffer refcount */
struct page **page_array; /* array of current buffer pages */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_REAPED 21 /* mm has been already reaped */
-#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */
+#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
atomic_t sigcnt;
atomic_t live;
int nr_threads;
- atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */
struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
short oom_score_adj; /* OOM kill score adjustment */
short oom_score_adj_min; /* OOM kill score adjustment min value.
* Only settable by CAP_SYS_RESOURCE. */
+ struct mm_struct *oom_mm; /* recorded mm when the thread group got
+ * killed by the oom killer */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
__mmdrop(mm);
}
+static inline void mmdrop_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ __mmdrop(mm);
+}
+
+static inline void mmdrop_async(struct mm_struct *mm)
+{
+ if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+ INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
static inline bool mmget_not_zero(struct mm_struct *mm)
{
return atomic_inc_not_zero(&mm->mm_users);
struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */
+ bool complex_mode; /* no parallel simple ops */
};
#ifdef CONFIG_SYSVIPC
void seq_printf(struct seq_file *m, const char *fmt, ...);
void seq_putc(struct seq_file *m, char c);
void seq_puts(struct seq_file *m, const char *s);
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
unsigned long long num);
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num);
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num);
void seq_escape(struct seq_file *m, const char *s, const char *esc);
void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
unsigned int next; /* Likely next allocation offset */
};
+struct swap_cluster_list {
+ struct swap_cluster_info head;
+ struct swap_cluster_info tail;
+};
+
/*
* The in-memory structure used to track swap areas.
*/
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
- struct swap_cluster_info free_cluster_head; /* free cluster list head */
- struct swap_cluster_info free_cluster_tail; /* free cluster list tail */
+ struct swap_cluster_list free_clusters; /* free clusters list */
unsigned int lowest_bit; /* index of first free in swap_map */
unsigned int highest_bit; /* index of last free in swap_map */
unsigned int pages; /* total of usable pages of swap */
* first.
*/
struct work_struct discard_work; /* discard worker */
- struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
- struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ struct swap_cluster_list discard_clusters; /* discard clusters list */
};
/* linux/mm/workingset.c */
#else
static inline void laptop_sync_completion(void) { }
#endif
-void throttle_vm_writeout(gfp_t gfp_mask);
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
EM( COMPACT_SKIPPED, "skipped") \
EM( COMPACT_DEFERRED, "deferred") \
EM( COMPACT_CONTINUE, "continue") \
- EM( COMPACT_PARTIAL, "partial") \
+ EM( COMPACT_SUCCESS, "success") \
EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \
EM( COMPACT_COMPLETE, "complete") \
EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \
--- /dev/null
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zsmalloc
+
+#if !defined(_TRACE_ZSMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZSMALLOC_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(zs_compact_start,
+
+ TP_PROTO(const char *pool_name),
+
+ TP_ARGS(pool_name),
+
+ TP_STRUCT__entry(
+ __field(const char *, pool_name)
+ ),
+
+ TP_fast_assign(
+ __entry->pool_name = pool_name;
+ ),
+
+ TP_printk("pool %s",
+ __entry->pool_name)
+);
+
+TRACE_EVENT(zs_compact_end,
+
+ TP_PROTO(const char *pool_name, unsigned long pages_compacted),
+
+ TP_ARGS(pool_name, pages_compacted),
+
+ TP_STRUCT__entry(
+ __field(const char *, pool_name)
+ __field(unsigned long, pages_compacted)
+ ),
+
+ TP_fast_assign(
+ __entry->pool_name = pool_name;
+ __entry->pages_compacted = pages_compacted;
+ ),
+
+ TP_printk("pool %s: %ld pages compacted",
+ __entry->pool_name,
+ __entry->pages_compacted)
+);
+
+TRACE_EVENT(zs_compact,
+
+ TP_PROTO(int class, unsigned long nr_migrated_obj, unsigned long nr_freed_pages),
+
+ TP_ARGS(class, nr_migrated_obj, nr_freed_pages),
+
+ TP_STRUCT__entry(
+ __field(int, class)
+ __field(unsigned long, nr_migrated_obj)
+ __field(unsigned long, nr_freed_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->class = class;
+ __entry->nr_migrated_obj = nr_migrated_obj;
+ __entry->nr_freed_pages = nr_freed_pages;
+ ),
+
+ TP_printk("class %3d: %ld objects migrated, %ld pages freed",
+ __entry->class,
+ __entry->nr_migrated_obj,
+ __entry->nr_freed_pages)
+);
+
+#endif /* _TRACE_ZSMALLOC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
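As a rough usage sketch (not part of the patch; the pool structure and compaction helper below are invented for illustration, and the header is assumed to land at include/trace/events/zsmalloc.h), a compaction pass would emit these events roughly as follows:

/* Illustrative only: where the zsmalloc tracepoints above would fire. */
#define CREATE_TRACE_POINTS
#include <trace/events/zsmalloc.h>

struct example_pool {
	const char *name;
	unsigned long pages_compacted;	/* running total for the pool */
};

static void example_compact(struct example_pool *pool)
{
	unsigned long nr_migrated = 0, nr_freed = 0;

	trace_zs_compact_start(pool->name);

	/* ... migrate objects within one size class, updating the counters ... */
	trace_zs_compact(/* class */ 0, nr_migrated, nr_freed);

	pool->pages_compacted += nr_freed;
	trace_zs_compact_end(pool->name, pool->pages_compacted);
}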
--- /dev/null
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#ifndef _UAPI_LINUX_AUTO_DEV_IOCTL_H
+#define _UAPI_LINUX_AUTO_DEV_IOCTL_H
+
+#include <linux/auto_fs.h>
+#include <linux/string.h>
+
+#define AUTOFS_DEVICE_NAME "autofs"
+
+#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
+
+#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
+
+/*
+ * An ioctl interface for autofs mount point control.
+ */
+
+struct args_protover {
+ __u32 version;
+};
+
+struct args_protosubver {
+ __u32 sub_version;
+};
+
+struct args_openmount {
+ __u32 devid;
+};
+
+struct args_ready {
+ __u32 token;
+};
+
+struct args_fail {
+ __u32 token;
+ __s32 status;
+};
+
+struct args_setpipefd {
+ __s32 pipefd;
+};
+
+struct args_timeout {
+ __u64 timeout;
+};
+
+struct args_requester {
+ __u32 uid;
+ __u32 gid;
+};
+
+struct args_expire {
+ __u32 how;
+};
+
+struct args_askumount {
+ __u32 may_umount;
+};
+
+struct args_ismountpoint {
+ union {
+ struct args_in {
+ __u32 type;
+ } in;
+ struct args_out {
+ __u32 devid;
+ __u32 magic;
+ } out;
+ };
+};
+
+/*
+ * All the ioctls use this structure.
+ * When sending a path, the size field must account for the total
+ * length of the chunk of memory; otherwise it is the size of the
+ * structure.
+ */
+
+struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ /* Command parameters */
+
+ union {
+ struct args_protover protover;
+ struct args_protosubver protosubver;
+ struct args_openmount openmount;
+ struct args_ready ready;
+ struct args_fail fail;
+ struct args_setpipefd setpipefd;
+ struct args_timeout timeout;
+ struct args_requester requester;
+ struct args_expire expire;
+ struct args_askumount askumount;
+ struct args_ismountpoint ismountpoint;
+ };
+
+ char path[0];
+};
+
+static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
+{
+ memset(in, 0, sizeof(struct autofs_dev_ioctl));
+ in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+ in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+ in->size = sizeof(struct autofs_dev_ioctl);
+ in->ioctlfd = -1;
+}
+
+/*
+ * If you change this make sure you make the corresponding change
+ * to autofs-dev-ioctl.c:lookup_ioctl()
+ */
+enum {
+ /* Get various version info */
+ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
+ AUTOFS_DEV_IOCTL_PROTOVER_CMD,
+ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD,
+
+ /* Open mount ioctl fd */
+ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD,
+
+ /* Close mount ioctl fd */
+ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD,
+
+ /* Mount/expire status returns */
+ AUTOFS_DEV_IOCTL_READY_CMD,
+ AUTOFS_DEV_IOCTL_FAIL_CMD,
+
+ /* Activate/deactivate autofs mount */
+ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD,
+ AUTOFS_DEV_IOCTL_CATATONIC_CMD,
+
+ /* Expiry timeout */
+ AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
+
+ /* Get mount last requesting uid and gid */
+ AUTOFS_DEV_IOCTL_REQUESTER_CMD,
+
+ /* Check for eligible expire candidates */
+ AUTOFS_DEV_IOCTL_EXPIRE_CMD,
+
+ /* Request busy status */
+ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD,
+
+ /* Check if path is a mountpoint */
+ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
+};
+
+#define AUTOFS_IOCTL 0x93
+
+#define AUTOFS_DEV_IOCTL_VERSION \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOVER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOSUBVER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_OPENMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_READY \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_FAIL \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_SETPIPEFD \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CATATONIC \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_TIMEOUT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_REQUESTER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_EXPIRE \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ASKUMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl)
+
+#endif /* _UAPI_LINUX_AUTO_DEV_IOCTL_H */
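For reference, a minimal user-space sketch of driving this header (illustrative only: the path and device id are placeholders, error handling is abbreviated, and the header is assumed to be installed as <linux/auto_dev-ioctl.h>; on success the kernel is expected to return the new descriptor in the ioctlfd field):

/* Hypothetical example: obtain an ioctl fd for an autofs mount point. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
	const char *path = "/mnt/autofs/indirect";	/* placeholder path */
	size_t size = sizeof(struct autofs_dev_ioctl) + strlen(path) + 1;
	struct autofs_dev_ioctl *param;
	int ctl_fd, ret;

	param = calloc(1, size);
	if (!param)
		return 1;

	init_autofs_dev_ioctl(param);
	param->size = size;			/* struct plus the trailing path */
	param->openmount.devid = 42;		/* placeholder: taken from /proc/mounts */
	strcpy(param->path, path);

	ctl_fd = open("/dev/autofs", O_RDONLY);
	if (ctl_fd < 0) {
		free(param);
		return 1;
	}

	ret = ioctl(ctl_fd, AUTOFS_DEV_IOCTL_OPENMOUNT, param);
	if (ret < 0)
		perror("AUTOFS_DEV_IOCTL_OPENMOUNT");
	else
		printf("ioctlfd = %d\n", param->ioctlfd);

	close(ctl_fd);
	free(param);
	return ret < 0;
}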
#define _UAPI_LINUX_AUTO_FS_H
#include <linux/types.h>
+#include <linux/limits.h>
#ifndef __KERNEL__
#include <sys/ioctl.h>
#endif /* __KERNEL__ */
#include <linux/pid_namespace.h>
#include <linux/device.h>
#include <linux/kthread.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/idr.h>
*/
static void __init mm_init(void)
{
+ /* Does address_space.flags still fit into a 32-bit ulong? */
+ BUILD_BUG_ON(AS_LAST_FLAG > 32);
+
/*
* page_ext requires contiguous pages,
* bigger than MAX_ORDER unless SPARSEMEM.
long r_msgtype;
long r_maxsize;
- /*
- * Mark r_msg volatile so that the compiler
- * does not try to get smart and optimize
- * it. We rely on this for the lockless
- * receive algorithm.
- */
- struct msg_msg *volatile r_msg;
+ struct msg_msg *r_msg;
};
/* one msg_sender for each sleeping sender */
struct msg_sender {
struct list_head list;
struct task_struct *tsk;
+ size_t msgsz;
};
#define SEARCH_ANY 1
return msq->q_perm.id;
}
-static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)
+static inline bool msg_fits_inqueue(struct msg_queue *msq, size_t msgsz)
+{
+ return msgsz + msq->q_cbytes <= msq->q_qbytes &&
+ 1 + msq->q_qnum <= msq->q_qbytes;
+}
+
+static inline void ss_add(struct msg_queue *msq,
+ struct msg_sender *mss, size_t msgsz)
{
mss->tsk = current;
+ mss->msgsz = msgsz;
__set_current_state(TASK_INTERRUPTIBLE);
list_add_tail(&mss->list, &msq->q_senders);
}
static inline void ss_del(struct msg_sender *mss)
{
- if (mss->list.next != NULL)
+ if (mss->list.next)
list_del(&mss->list);
}
-static void ss_wakeup(struct list_head *h, int kill)
+static void ss_wakeup(struct msg_queue *msq,
+ struct wake_q_head *wake_q, bool kill)
{
struct msg_sender *mss, *t;
+ struct task_struct *stop_tsk = NULL;
+ struct list_head *h = &msq->q_senders;
list_for_each_entry_safe(mss, t, h, list) {
if (kill)
mss->list.next = NULL;
- wake_up_process(mss->tsk);
+
+ /*
+		 * Stop at the first task we don't wake up;
+		 * we've already iterated the original
+		 * sender queue.
+ */
+ else if (stop_tsk == mss->tsk)
+ break;
+ /*
+ * We are not in an EIDRM scenario here, therefore
+		 * verify that we really need to wake up the task.
+ * To maintain current semantics and wakeup order,
+ * move the sender to the tail on behalf of the
+ * blocked task.
+ */
+ else if (!msg_fits_inqueue(msq, mss->msgsz)) {
+ if (!stop_tsk)
+ stop_tsk = mss->tsk;
+
+ list_move_tail(&mss->list, &msq->q_senders);
+ continue;
+ }
+
+ wake_q_add(wake_q, mss->tsk);
}
}
-static void expunge_all(struct msg_queue *msq, int res)
+static void expunge_all(struct msg_queue *msq, int res,
+ struct wake_q_head *wake_q)
{
struct msg_receiver *msr, *t;
list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
- msr->r_msg = NULL; /* initialize expunge ordering */
- wake_up_process(msr->r_tsk);
- /*
- * Ensure that the wakeup is visible before setting r_msg as
- * the receiving end depends on it: either spinning on a nil,
- * or dealing with -EAGAIN cases. See lockless receive part 1
- * and 2 in do_msgrcv().
- */
- smp_wmb(); /* barrier (B) */
- msr->r_msg = ERR_PTR(res);
+ wake_q_add(wake_q, msr->r_tsk);
+ WRITE_ONCE(msr->r_msg, ERR_PTR(res));
}
}
{
struct msg_msg *msg, *t;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
+ WAKE_Q(wake_q);
- expunge_all(msq, -EIDRM);
- ss_wakeup(&msq->q_senders, 1);
+ expunge_all(msq, -EIDRM, &wake_q);
+ ss_wakeup(msq, &wake_q, true);
msg_rmid(ns, msq);
ipc_unlock_object(&msq->q_perm);
+ wake_up_q(&wake_q);
rcu_read_unlock();
list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
freeque(ns, ipcp);
goto out_up;
case IPC_SET:
+ {
+ WAKE_Q(wake_q);
+
if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
!capable(CAP_SYS_RESOURCE)) {
err = -EPERM;
msq->q_qbytes = msqid64.msg_qbytes;
msq->q_ctime = get_seconds();
- /* sleeping receivers might be excluded by
+ /*
+ * Sleeping receivers might be excluded by
* stricter permissions.
*/
- expunge_all(msq, -EAGAIN);
- /* sleeping senders might be able to send
+ expunge_all(msq, -EAGAIN, &wake_q);
+ /*
+ * Sleeping senders might be able to send
* due to a larger queue size.
*/
- ss_wakeup(&msq->q_senders, 0);
- break;
+ ss_wakeup(msq, &wake_q, false);
+ ipc_unlock_object(&msq->q_perm);
+ wake_up_q(&wake_q);
+
+ goto out_unlock1;
+ }
default:
err = -EINVAL;
goto out_unlock1;
return 0;
}
-static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
+static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
+ struct wake_q_head *wake_q)
{
struct msg_receiver *msr, *t;
list_del(&msr->r_list);
if (msr->r_maxsize < msg->m_ts) {
- /* initialize pipelined send ordering */
- msr->r_msg = NULL;
- wake_up_process(msr->r_tsk);
- /* barrier (B) see barrier comment below */
- smp_wmb();
- msr->r_msg = ERR_PTR(-E2BIG);
+ wake_q_add(wake_q, msr->r_tsk);
+ WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG));
} else {
- msr->r_msg = NULL;
msq->q_lrpid = task_pid_vnr(msr->r_tsk);
msq->q_rtime = get_seconds();
- wake_up_process(msr->r_tsk);
- /*
- * Ensure that the wakeup is visible before
- * setting r_msg, as the receiving can otherwise
- * exit - once r_msg is set, the receiver can
- * continue. See lockless receive part 1 and 2
- * in do_msgrcv(). Barrier (B).
- */
- smp_wmb();
- msr->r_msg = msg;
+ wake_q_add(wake_q, msr->r_tsk);
+ WRITE_ONCE(msr->r_msg, msg);
return 1;
}
}
struct msg_msg *msg;
int err;
struct ipc_namespace *ns;
+ WAKE_Q(wake_q);
ns = current->nsproxy->ipc_ns;
goto out_unlock1;
}
- ipc_lock_object(&msq->q_perm);
-
for (;;) {
struct msg_sender s;
err = -EACCES;
if (ipcperms(ns, &msq->q_perm, S_IWUGO))
- goto out_unlock0;
+ goto out_unlock1;
+
+ ipc_lock_object(&msq->q_perm);
/* raced with RMID? */
if (!ipc_valid_object(&msq->q_perm)) {
if (err)
goto out_unlock0;
- if (msgsz + msq->q_cbytes <= msq->q_qbytes &&
- 1 + msq->q_qnum <= msq->q_qbytes) {
+ if (msg_fits_inqueue(msq, msgsz))
break;
- }
/* queue full, wait: */
if (msgflg & IPC_NOWAIT) {
}
/* enqueue the sender and prepare to block */
- ss_add(msq, &s);
+ ss_add(msq, &s, msgsz);
if (!ipc_rcu_getref(msq)) {
err = -EIDRM;
err = -EIDRM;
goto out_unlock0;
}
-
ss_del(&s);
if (signal_pending(current)) {
goto out_unlock0;
}
+ ipc_unlock_object(&msq->q_perm);
}
+
msq->q_lspid = task_tgid_vnr(current);
msq->q_stime = get_seconds();
- if (!pipelined_send(msq, msg)) {
+ if (!pipelined_send(msq, msg, &wake_q)) {
/* no one is waiting for this message, enqueue it */
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
out_unlock0:
ipc_unlock_object(&msq->q_perm);
+ wake_up_q(&wake_q);
out_unlock1:
rcu_read_unlock();
if (msg != NULL)
struct msg_queue *msq;
struct ipc_namespace *ns;
struct msg_msg *msg, *copy = NULL;
+ WAKE_Q(wake_q);
ns = current->nsproxy->ipc_ns;
msq->q_cbytes -= msg->m_ts;
atomic_sub(msg->m_ts, &ns->msg_bytes);
atomic_dec(&ns->msg_hdrs);
- ss_wakeup(&msq->q_senders, 0);
+ ss_wakeup(msq, &wake_q, false);
goto out_unlock0;
}
rcu_read_unlock();
schedule();
- /* Lockless receive, part 1:
- * Disable preemption. We don't hold a reference to the queue
- * and getting a reference would defeat the idea of a lockless
- * operation, thus the code relies on rcu to guarantee the
- * existence of msq:
+ /*
+ * Lockless receive, part 1:
+ * We don't hold a reference to the queue and getting a
+ * reference would defeat the idea of a lockless operation,
+ * thus the code relies on rcu to guarantee the existence of
+ * msq:
	 * Prior to destruction, expunge_all(-EIDRM) changes r_msg.
	 * Thus if r_msg is -EAGAIN, then the queue is not yet destroyed.
- * rcu_read_lock() prevents preemption between reading r_msg
- * and acquiring the q_perm.lock in ipc_lock_object().
*/
rcu_read_lock();
- /* Lockless receive, part 2:
- * Wait until pipelined_send or expunge_all are outside of
- * wake_up_process(). There is a race with exit(), see
- * ipc/mqueue.c for the details. The correct serialization
- * ensures that a receiver cannot continue without the wakeup
- * being visibible _before_ setting r_msg:
- *
- * CPU 0 CPU 1
- * <loop receiver>
- * smp_rmb(); (A) <-- pair -. <waker thread>
- * <load ->r_msg> | msr->r_msg = NULL;
- * | wake_up_process();
- * <continue> `------> smp_wmb(); (B)
- * msr->r_msg = msg;
+ /*
+ * Lockless receive, part 2:
+ * The work in pipelined_send() and expunge_all():
+ * - Set pointer to message
+ * - Queue the receiver task for later wakeup
+ * - Wake up the process after the lock is dropped.
*
- * Where (A) orders the message value read and where (B) orders
- * the write to the r_msg -- done in both pipelined_send and
- * expunge_all.
- */
- for (;;) {
- /*
- * Pairs with writer barrier in pipelined_send
- * or expunge_all.
- */
- smp_rmb(); /* barrier (A) */
- msg = (struct msg_msg *)msr_d.r_msg;
- if (msg)
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier
- * which forces everything in this loop to be
- * re-loaded.
- */
- cpu_relax();
- }
-
- /* Lockless receive, part 3:
- * If there is a message or an error then accept it without
- * locking.
+ * Should the process wake up before this wakeup (due to a
+ * signal) it will either see the message and continue ...
*/
+ msg = READ_ONCE(msr_d.r_msg);
if (msg != ERR_PTR(-EAGAIN))
goto out_unlock1;
- /* Lockless receive, part 3:
- * Acquire the queue spinlock.
- */
+ /*
+ * ... or see -EAGAIN, acquire the lock to check the message
+ * again.
+ */
ipc_lock_object(&msq->q_perm);
- /* Lockless receive, part 4:
- * Repeat test after acquiring the spinlock.
- */
- msg = (struct msg_msg *)msr_d.r_msg;
+ msg = msr_d.r_msg;
if (msg != ERR_PTR(-EAGAIN))
goto out_unlock0;
out_unlock0:
ipc_unlock_object(&msq->q_perm);
+ wake_up_q(&wake_q);
out_unlock1:
rcu_read_unlock();
if (IS_ERR(msg)) {
/*
* Locking:
+ * a) global sem_lock() for read/write
* sem_undo.id_next,
* sem_array.complex_count,
- * sem_array.pending{_alter,_cont},
- * sem_array.sem_undo: global sem_lock() for read/write
- * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ * sem_array.complex_mode
+ * sem_array.pending{_alter,_const},
+ * sem_array.sem_undo
*
+ * b) global or semaphore sem_lock() for read/write:
* sem_array.sem_base[i].pending_{const,alter}:
- * global or semaphore sem_lock() for read/write
+ * sem_array.complex_mode (for read)
+ *
+ * c) special:
+ * sem_undo_list.list_proc:
+ * * undo_list->lock for write
+ * * rcu for read
*/
#define sc_semmsl sem_ctls[0]
}
/*
- * Wait until all currently ongoing simple ops have completed.
+ * Enter the mode suitable for non-simple operations:
* Caller must own sem_perm.lock.
- * New simple ops cannot start, because simple ops first check
- * that sem_perm.lock is free.
- * that a) sem_perm.lock is free and b) complex_count is 0.
*/
-static void sem_wait_array(struct sem_array *sma)
+static void complexmode_enter(struct sem_array *sma)
{
int i;
struct sem *sem;
- if (sma->complex_count) {
- /* The thread that increased sma->complex_count waited on
- * all sem->lock locks. Thus we don't need to wait again.
- */
+ if (sma->complex_mode) {
+ /* We are already in complex_mode. Nothing to do */
return;
}
+	/* We need a full barrier after setting complex_mode:
+ * The write to complex_mode must be visible
+ * before we read the first sem->lock spinlock state.
+ */
+ smp_store_mb(sma->complex_mode, true);
+
for (i = 0; i < sma->sem_nsems; i++) {
sem = sma->sem_base + i;
spin_unlock_wait(&sem->lock);
}
+ /*
+	 * spin_unlock_wait() is not a memory barrier, it is only a
+	 * control barrier. The code must pair with spin_unlock(&sem->lock),
+	 * thus the control barrier alone is insufficient.
+ *
+ * smp_rmb() is sufficient, as writes cannot pass the control barrier.
+ */
+ smp_rmb();
+}
+
+/*
+ * Try to leave the mode that disallows simple operations:
+ * Caller must own sem_perm.lock.
+ */
+static void complexmode_tryleave(struct sem_array *sma)
+{
+ if (sma->complex_count) {
+ /* Complex ops are sleeping.
+ * We must stay in complex mode
+ */
+ return;
+ }
+ /*
+ * Immediately after setting complex_mode to false,
+ * a simple op can start. Thus: all memory writes
+ * performed by the current operation must be visible
+ * before we set complex_mode to false.
+ */
+ smp_store_release(&sma->complex_mode, false);
}
+#define SEM_GLOBAL_LOCK (-1)
/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
/* Complex operation - acquire a full lock */
ipc_lock_object(&sma->sem_perm);
- /* And wait until all simple ops that are processed
- * right now have dropped their locks.
- */
- sem_wait_array(sma);
- return -1;
+ /* Prevent parallel simple ops */
+ complexmode_enter(sma);
+ return SEM_GLOBAL_LOCK;
}
/*
* Only one semaphore affected - try to optimize locking.
- * The rules are:
- * - optimized locking is possible if no complex operation
- * is either enqueued or processed right now.
- * - The test for enqueued complex ops is simple:
- * sma->complex_count != 0
- * - Testing for complex ops that are processed right now is
- * a bit more difficult. Complex ops acquire the full lock
- * and first wait that the running simple ops have completed.
- * (see above)
- * Thus: If we own a simple lock and the global lock is free
- * and complex_count is now 0, then it will stay 0 and
- * thus just locking sem->lock is sufficient.
+ * Optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ *
+ * Both facts are tracked by complex_mode.
*/
sem = sma->sem_base + sops->sem_num;
- if (sma->complex_count == 0) {
+ /*
+ * Initial check for complex_mode. Just an optimization,
+ * no locking, no memory barrier.
+ */
+ if (!sma->complex_mode) {
/*
* It appears that no complex operation is around.
* Acquire the per-semaphore lock.
*/
spin_lock(&sem->lock);
- /* Then check that the global lock is free */
- if (!spin_is_locked(&sma->sem_perm.lock)) {
- /*
- * We need a memory barrier with acquire semantics,
- * otherwise we can race with another thread that does:
- * complex_count++;
- * spin_unlock(sem_perm.lock);
- */
- smp_acquire__after_ctrl_dep();
+ /*
+ * See 51d7d5205d33
+ * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
+ * A full barrier is required: the write of sem->lock
+ * must be visible before the read is executed
+ */
+ smp_mb();
- /*
- * Now repeat the test of complex_count:
- * It can't change anymore until we drop sem->lock.
- * Thus: if is now 0, then it will stay 0.
- */
- if (sma->complex_count == 0) {
- /* fast path successful! */
- return sops->sem_num;
- }
+ if (!smp_load_acquire(&sma->complex_mode)) {
+ /* fast path successful! */
+ return sops->sem_num;
}
spin_unlock(&sem->lock);
}
/* Not a false alarm, thus complete the sequence for a
* full lock.
*/
- sem_wait_array(sma);
- return -1;
+ complexmode_enter(sma);
+ return SEM_GLOBAL_LOCK;
}
}
static inline void sem_unlock(struct sem_array *sma, int locknum)
{
- if (locknum == -1) {
+ if (locknum == SEM_GLOBAL_LOCK) {
unmerge_queues(sma);
+ complexmode_tryleave(sma);
ipc_unlock_object(&sma->sem_perm);
} else {
struct sem *sem = sma->sem_base + locknum;
}
sma->complex_count = 0;
+ sma->complex_mode = true; /* dropped by sem_unlock below */
INIT_LIST_HEAD(&sma->pending_alter);
INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id);
/*
* The proc interface isn't aware of sem_lock(), it calls
* ipc_lock_object() directly (in sysvipc_find_ipc).
- * In order to stay compatible with sem_lock(), we must wait until
- * all simple semop() calls have left their critical regions.
+ * In order to stay compatible with sem_lock(), we must
+ * enter / leave complex_mode.
*/
- sem_wait_array(sma);
+ complexmode_enter(sma);
sem_otime = get_semotime(sma);
sem_otime,
sma->sem_ctime);
+ complexmode_tryleave(sma);
+
return 0;
}
#endif
CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
CONFIG_AUDIT=y
-CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_SCHED=y
CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
+CONFIG_DEFAULT_SECURITY_SELINUX=y
CONFIG_EMBEDDED=y
CONFIG_FB=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_IPV6_MIP6=y
CONFIG_IPV6_MULTIPLE_TABLES=y
CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_PRIVACY=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_QUOTA=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECCOMP=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
# CONFIG_PM_WAKELOCKS_GC is not set
# CONFIG_VT is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_COMPACTION=y
CONFIG_DEBUG_RODATA=y
+CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
CONFIG_DRAGONRISE_FF=y
CONFIG_ENABLE_DEFAULT_TRACERS=y
CONFIG_EXT4_FS=y
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- exit_oom_victim(tsk);
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
+ /*
+ * __mmdrop is not safe to call from softirq context on x86 due to
+	 * pgd_dtor, so postpone it to the async context.
+ */
+ if (sig->oom_mm)
+ mmdrop_async(sig->oom_mm);
kmem_cache_free(signal_cachep, sig);
}
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
+ mm_put_huge_zero_page(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
struct group_info *groups_alloc(int gidsetsize)
{
- struct group_info *group_info;
- int nblocks;
- int i;
-
- nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
- /* Make sure we always allocate at least one indirect block pointer */
- nblocks = nblocks ? : 1;
- group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
- if (!group_info)
+ struct group_info *gi;
+ unsigned int len;
+
+ len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
+ gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
+ if (!gi)
+ gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL);
+ if (!gi)
return NULL;
- group_info->ngroups = gidsetsize;
- group_info->nblocks = nblocks;
- atomic_set(&group_info->usage, 1);
-
- if (gidsetsize <= NGROUPS_SMALL)
- group_info->blocks[0] = group_info->small_block;
- else {
- for (i = 0; i < nblocks; i++) {
- kgid_t *b;
- b = (void *)__get_free_page(GFP_USER);
- if (!b)
- goto out_undo_partial_alloc;
- group_info->blocks[i] = b;
- }
- }
- return group_info;
-out_undo_partial_alloc:
- while (--i >= 0) {
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
- return NULL;
+ atomic_set(&gi->usage, 1);
+ gi->ngroups = gidsetsize;
+ return gi;
}
EXPORT_SYMBOL(groups_alloc);
void groups_free(struct group_info *group_info)
{
- if (group_info->blocks[0] != group_info->small_block) {
- int i;
- for (i = 0; i < group_info->nblocks; i++)
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
+ kvfree(group_info);
}
EXPORT_SYMBOL(groups_free);
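With the flat gid[] array replacing the old two-level block scheme, filling a group list becomes a plain loop. A hedged kernel-context sketch (the gid values are placeholders and error handling is elided):

/* Illustrative only: build a supplementary group list with the new layout. */
static struct group_info *example_build_groups(void)
{
	static const gid_t gids[] = { 4, 24, 27 };	/* placeholder values */
	struct group_info *gi;
	int i;

	gi = groups_alloc(ARRAY_SIZE(gids));
	if (!gi)
		return NULL;

	for (i = 0; i < gi->ngroups; i++)
		gi->gid[i] = make_kgid(current_user_ns(), gids[i]);

	return gi;	/* caller drops it with put_group_info() */
}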
for (i = 0; i < count; i++) {
gid_t gid;
- gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ gid = from_kgid_munged(user_ns, group_info->gid[i]);
if (put_user(gid, grouplist+i))
return -EFAULT;
}
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
}
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
- kgid_t tmp = GROUP_AT(group_info, right);
+ kgid_t tmp = group_info->gid[right];
- while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
- GROUP_AT(group_info, right) =
- GROUP_AT(group_info, left);
+ while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
+ group_info->gid[right] = group_info->gid[left];
right = left;
left -= stride;
}
- GROUP_AT(group_info, right) = tmp;
+ group_info->gid[right] = tmp;
}
stride /= 3;
}
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- if (gid_gt(grp, GROUP_AT(group_info, mid)))
+ if (gid_gt(grp, group_info->gid[mid]))
left = mid + 1;
- else if (gid_lt(grp, GROUP_AT(group_info, mid)))
+ else if (gid_lt(grp, group_info->gid[mid]))
right = mid;
else
return 1;
panic_smp_self_stop();
}
+/*
+ * Stop other CPUs in panic. Architecture-dependent code may override this
+ * with a more suitable version. For example, if the architecture supports
+ * crash dump, it should save registers of each stopped CPU and disable
+ * per-CPU features such as virtualization extensions.
+ */
+void __weak crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ /*
+	 * This function can be called twice in the panic path, but we
+	 * obviously execute it only once.
+ */
+ if (cpus_stopped)
+ return;
+
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
+ smp_send_stop();
+ cpus_stopped = 1;
+}
+
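Because crash_smp_send_stop() is declared __weak, an architecture can replace the default above. The following is only a sketch of what such an override might look like; the two helpers are invented placeholders for whatever per-CPU work the architecture actually needs (saving registers, disabling virtualization extensions, and so on):

/* Illustrative only: a possible architecture override of the weak default. */
extern void example_arch_save_regs_on_other_cpus(void);		/* placeholder */
extern void example_arch_disable_virt_on_other_cpus(void);	/* placeholder */

void crash_smp_send_stop(void)
{
	static int cpus_stopped;

	if (cpus_stopped)
		return;

	example_arch_save_regs_on_other_cpus();
	example_arch_disable_virt_on_other_cpus();
	smp_send_stop();
	cpus_stopped = 1;
}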
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
/*
if (!_crash_kexec_post_notifiers) {
printk_nmi_flush_on_panic();
__crash_kexec(NULL);
- }
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a panic
- * situation.
- */
- smp_send_stop();
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a
+ * panic situation.
+ */
+ smp_send_stop();
+ } else {
+ /*
+		 * If we want to do a crash dump after notifier calls and
+		 * kmsg_dump, we will need architecture-dependent extra
+		 * work in addition to stopping other CPUs.
+ */
+ crash_smp_send_stop();
+ }
/*
* Run any panic handlers, including those that might need to
/*
* Now that the whole userspace is frozen we need to disbale
* the OOM killer to disallow any further interference with
- * killable tasks.
+	 * killable tasks. There is no guarantee that oom victims will
+	 * ever go away on their own, so we have to wait with a timeout.
*/
- if (!error && !oom_killer_disable())
+ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
error = -EBUSY;
- /*
- * There is a hard to fix race between oom_reaper kernel thread
- * and oom_killer_disable. oom_reaper calls exit_oom_victim
- * before the victim reaches exit_mm so try to freeze all the tasks
- * again and catch such a left over task.
- */
- if (!error) {
- pr_info("Double checking all user space processes after OOM killer disable... ");
- error = try_to_freeze_tasks(true);
- pr_cont("\n");
- }
-
if (error)
thaw_processes();
return error;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
+#ifdef CONFIG_OF
+static bool of_specified_console;
+
+void console_set_by_of(void)
+{
+ of_specified_console = true;
+}
+#else
+# define of_specified_console false
+#endif
+
/* Flag: console code may call schedule() */
static int console_may_schedule;
* didn't select a console we take the first one
* that registers here.
*/
- if (preferred_console < 0) {
+ if (preferred_console < 0 && !of_specified_console) {
if (newcon->index < 0)
newcon->index = 0;
if (newcon->setup == NULL ||
{
BUG_ON(!child->ptrace);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
/*
/**
* wakeup_readers - wake up readers waiting on a channel
- * @data: contains the channel buffer
+ * @work: contains the channel buffer
*
- * This is the timer function used to defer reader waking.
+ * This is the function used to defer reader waking.
*/
-static void wakeup_readers(unsigned long data)
+static void wakeup_readers(struct irq_work *work)
{
- struct rchan_buf *buf = (struct rchan_buf *)data;
+ struct rchan_buf *buf;
+
+ buf = container_of(work, struct rchan_buf, wakeup_work);
wake_up_interruptible(&buf->read_wait);
}
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
- } else
- del_timer_sync(&buf->timer);
+ init_irq_work(&buf->wakeup_work, wakeup_readers);
+ } else {
+ irq_work_sync(&buf->wakeup_work);
+ }
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- del_timer_sync(&buf->timer);
+ irq_work_sync(&buf->wakeup_work);
buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait))
+ if (waitqueue_active(&buf->read_wait)) {
/*
* Calling wake_up_interruptible() from here
* will deadlock if we happen to be logging
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
- mod_timer(&buf->timer, jiffies + 1);
+ irq_work_queue(&buf->wakeup_work);
+ }
}
old = buf->data;
#include "sched.h"
+/* Linker adds these: start and end of __cpuidle functions */
+extern char __cpuidle_text_start[], __cpuidle_text_end[];
+
/**
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
__setup("hlt", cpu_idle_nopoll_setup);
#endif
-static inline int cpu_idle_poll(void)
+static noinline int __cpuidle cpu_idle_poll(void)
{
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
*
* To use when the cpuidle framework cannot be used.
*/
-void default_idle_call(void)
+void __cpuidle default_idle_call(void)
{
if (current_clr_polling_and_test()) {
local_irq_enable();
}
}
+bool cpu_in_idle(unsigned long pc)
+{
+ return pc >= (unsigned long)__cpuidle_text_start &&
+ pc < (unsigned long)__cpuidle_text_end;
+}
+
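cpu_in_idle() simply compares a program counter against the .cpuidle.text section bounds exported by the linker, so any function that should count as "idle" for this test is tagged __cpuidle, as with default_idle_call() above. A hedged sketch with a hypothetical driver callback (safe_halt() stands in for whatever the architecture's wait-for-interrupt idiom is):

/* Illustrative only: marking a hypothetical cpuidle callback as __cpuidle
 * places it in .cpuidle.text, so cpu_in_idle() recognises PCs inside it.
 */
#include <linux/cpuidle.h>

static int __cpuidle example_enter_idle(struct cpuidle_device *dev,
					struct cpuidle_driver *drv, int index)
{
	safe_halt();	/* placeholder for the arch's idle instruction */
	return index;
}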
void cpu_startup_entry(enum cpuhp_state state)
{
/*
extern int pid_max;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
-extern int compat_log;
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
.extra1 = &neg_one,
},
#endif
-#ifdef CONFIG_COMPAT
- {
- .procname = "compat-log",
- .data = &compat_log,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_RT_MUTEXES
{
.procname = "max_lock_depth",
kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
- kgid = GROUP_AT(group_info, i);
+ kgid = group_info->gid[i];
group = high2lowgid(from_kgid_munged(user_ns, kgid));
if (put_user(group, grouplist+i))
return -EFAULT;
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
- struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+	  This option provides the CRC64 ECMA function. Drivers may select
+	  this when they need to do cyclic redundancy checks according to
+	  the CRC64 ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
--- /dev/null
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Return the default CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Compute the 64-bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute the checksum for.
+ * nbytes: number of bytes in the data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
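A short usage sketch of the two exported helpers above (a hypothetical test module; the buffer contents are arbitrary and no expected value is asserted):

/* Illustrative only: checksum a buffer with the table-driven CRC64 ECMA code. */
#include <linux/module.h>
#include <linux/crc64_ecma.h>

static int __init crc64_example_init(void)
{
	static const u8 data[] = "123456789";
	u64 crc;

	/* Start from the default seed and run over the payload bytes. */
	crc = crc64_ecma(data, sizeof(data) - 1, crc64_ecma_seed());
	pr_info("crc64_ecma(\"123456789\") = %016llx\n", crc);
	return 0;
}
module_init(crc64_example_init);
MODULE_LICENSE("GPL");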
{
unsigned long long res;
unsigned int rv;
- int overflow;
res = 0;
rv = 0;
- overflow = 0;
while (*s) {
unsigned int val;
*/
if (unlikely(res & (~0ull << 60))) {
if (res > div_u64(ULLONG_MAX - val, base))
- overflow = 1;
+ rv |= KSTRTOX_OVERFLOW;
}
res = res * base + val;
rv++;
s++;
}
*p = res;
- if (overflow)
- rv |= KSTRTOX_OVERFLOW;
return rv;
}
#include <linux/delay.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
+#include <linux/cpu.h>
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+/* "in progress" flag of arch_trigger_cpumask_backtrace */
static unsigned long backtrace_flag;
/*
- * When raise() is called it will be is passed a pointer to the
+ * When raise() is called it will be passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
* directly from their raise() functions may rely on the mask
* they are passed being updated as a side effect of this call.
*/
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+ bool exclude_self,
void (*raise)(cpumask_t *mask))
{
int i, this_cpu = get_cpu();
return;
}
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
- if (!include_self)
+ cpumask_copy(to_cpumask(backtrace_mask), mask);
+ if (exclude_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+ /*
+ * Don't try to send an NMI to this cpu; it may work on some
+ * architectures, but on others it may not, and we'll get
+ * information at least as useful just by doing a dump_stack() here.
+ * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit.
+ */
+ if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask)))
+ nmi_cpu_backtrace(NULL);
+
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
- pr_info("Sending NMI to %s CPUs:\n",
- (include_self ? "all" : "other"));
+ pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n",
+ this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask));
raise(to_cpumask(backtrace_mask));
}
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- pr_warn("NMI backtrace for cpu %d\n", cpu);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ if (regs && cpu_in_idle(instruction_pointer(regs))) {
+ pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n",
+ cpu, instruction_pointer(regs));
+ } else {
+ pr_warn("NMI backtrace for cpu %d\n", cpu);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ }
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
alloc_flags &= ~GFP_ZONEMASK;
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
alloc_flags |= __GFP_NOWARN;
+ /*
+ * Avoid using current->mempolicy which may already have
+ * been freed -- we may be in the process of saving the
+ * stack for exactly that __mpol_put() call.
+ */
+ alloc_flags |= __GFP_THISNODE;
page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
if (page)
prealloc = page_address(page);
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/kasan-checks.h>
+#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
long retval;
kasan_check_write(dst, count);
+ check_object_size(dst, count, false);
user_access_begin();
retval = do_strncpy_from_user(dst, src, count, max);
user_access_end();
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/slab.h>
-#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
-#include <linux/memblock.h>
#include <linux/bug.h>
#include <linux/io.h>
-
-#include <asm/processor.h>
+#include <linux/bootmem.h>
#include "internal.h"
void *ptr;
if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
again:
/* do not panic in alloc_bootmem_bdata() */
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
}
}
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
-
/**
* __alloc_bootmem_low - allocate low boot memory
* @size: size of the request in bytes
#ifdef CONFIG_COMPACTION
/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+static bool suitable_migration_target(struct compact_control *cc,
+ struct page *page)
{
+ if (cc->ignore_block_suitable)
+ return true;
+
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
/*
continue;
/* Check the block is suitable for migration */
- if (!suitable_migration_target(page))
+ if (!suitable_migration_target(cc, page))
continue;
/* If isolation recently failed, do not retry */
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
- watermark = low_wmark_pages(zone);
+ watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
cc->alloc_flags))
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
!list_empty(&area->free_list[MIGRATE_CMA]))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
#endif
/*
* Job done if allocation would steal freepages from
*/
if (find_suitable_fallback(area, order, migratetype,
true, &can_steal) != -1)
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
}
return COMPACT_NO_SUITABLE_PAGE;
* compaction_suitable: Is this suitable to run compaction on this zone now?
* Returns
* COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_SUCCESS - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
static enum compact_result __compaction_suitable(struct zone *zone, int order,
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
- watermark = low_wmark_pages(zone);
+ watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/*
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
if (zone_watermark_ok(zone, order, watermark, classzone_idx,
alloc_flags))
- return COMPACT_PARTIAL;
+ return COMPACT_SUCCESS;
/*
- * Watermarks for order-0 must be met for compaction. Note the 2UL.
- * This is because during migration, copies of pages need to be
- * allocated and for a short time, the footprint is higher
+ * Watermarks for order-0 must be met for compaction to be able to
+ * isolate free pages for migration targets. This means that the
+ * watermark and alloc_flags have to match, or be more pessimistic than
+ * the check in __isolate_free_page(). We don't use the direct
+ * compactor's alloc_flags, as they are not relevant for freepage
+ * isolation. We however do use the direct compactor's classzone_idx to
+ * skip over zones where lowmem reserves would prevent allocation even
+ * if compaction succeeds.
+	 * For costly orders, we require the low watermark instead of the
+	 * min watermark, to increase compaction's chances of proceeding.
+	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+	 * suitable migration targets.
*/
- watermark += (2UL << order);
+ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ low_wmark_pages(zone) : min_wmark_pages(zone);
+ watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
- alloc_flags, wmark_target))
+ ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
/*
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
- if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
+ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
/* huh, compaction_suitable is returning something unexpected */
/*
	 * Set up to move all movable pages to the end of the zone. Use cached
- * information on where the scanners should start but check that it
- * is initialised by ensuring the values are within zone boundaries.
+ * information on where the scanners should start (unless we explicitly
+ * want to compact the whole zone), but check that it is initialised
+ * by ensuring the values are within zone boundaries.
*/
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
- cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
- }
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+ if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
- }
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ } else {
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ }
- if (cc->migrate_pfn == start_pfn)
- cc->whole_zone = true;
+ if (cc->migrate_pfn == start_pfn)
+ cc->whole_zone = true;
+ }
cc->last_migrated_pfn = 0;
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
.direct_compaction = true,
+ .whole_zone = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
ac->nodemask) {
enum compact_result status;
- if (compaction_deferred(zone, order)) {
+ if (prio > MIN_COMPACT_PRIORITY
+ && compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
}
alloc_flags, ac_classzone_idx(ac));
rc = max(status, rc);
- /* If a normal allocation would succeed, stop compacting */
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags)) {
+ /* The allocation should succeed, stop compacting */
+ if (status == COMPACT_SUCCESS) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
/* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void compact_node(int nid)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
int zoneid;
struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ };
+
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
if (!populated_zone(zone))
continue;
- cc->nr_freepages = 0;
- cc->nr_migratepages = 0;
- cc->zone = zone;
- INIT_LIST_HEAD(&cc->freepages);
- INIT_LIST_HEAD(&cc->migratepages);
-
- /*
- * When called via /proc/sys/vm/compact_memory
- * this makes sure we compact the whole zone regardless of
- * cached scanner positions.
- */
- if (is_via_compact_memory(cc->order))
- __reset_isolation_suitable(zone);
-
- if (is_via_compact_memory(cc->order) ||
- !compaction_deferred(zone, cc->order))
- compact_zone(zone, cc);
-
- VM_BUG_ON(!list_empty(&cc->freepages));
- VM_BUG_ON(!list_empty(&cc->migratepages));
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
- if (is_via_compact_memory(cc->order))
- continue;
+ compact_zone(zone, &cc);
- if (zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0))
- compaction_defer_reset(zone, cc->order, false);
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
}
}
-void compact_pgdat(pg_data_t *pgdat, int order)
-{
- struct compact_control cc = {
- .order = order,
- .mode = MIGRATE_ASYNC,
- };
-
- if (!order)
- return;
-
- __compact_pgdat(pgdat, &cc);
-}
-
-static void compact_node(int nid)
-{
- struct compact_control cc = {
- .order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- };
-
- __compact_pgdat(NODE_DATA(nid), &cc);
-}
-
/* Compact all nodes in the system */
static void compact_nodes(void)
{
.ignore_skip_hint = true,
};
- bool success = false;
-
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.classzone_idx);
count_vm_event(KCOMPACTD_WAKE);
return;
status = compact_zone(zone, &cc);
- if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
- cc.classzone_idx, 0)) {
- success = true;
+ if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
/*
void __dump_page(struct page *page, const char *reason)
{
+ int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
- page, page_ref_count(page), page_mapcount(page),
- page->mapping, page->index);
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page));
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
* wait_on_page_locked is used to avoid unnecessarily
* serialisations and why it's safe.
*/
- wait_on_page_locked_killable(page);
+ error = wait_on_page_locked_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
if (PageUptodate(page))
goto page_ok;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
return READ_ONCE(huge_zero_page);
}
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ return READ_ONCE(huge_zero_page);
+
+ if (!get_huge_zero_page())
+ return NULL;
+
+ if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+
+ return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+}
+
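+/*
+ * Caller sketch (an assumption about the surrounding wiring): the first
+ * mm_get_huge_zero_page() call in a given mm takes one reference on the
+ * global huge zero page and records that in MMF_HUGE_ZERO_PAGE; later calls
+ * from the same mm are reference-free, and the single reference is dropped
+ * once per mm via mm_put_huge_zero_page(), typically on mm teardown:
+ *
+ *   zero_page = mm_get_huge_zero_page(vma->vm_mm);   (fault path)
+ *   ...
+ *   mm_put_huge_zero_page(mm);                       (when the mm goes away)
+ */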
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
struct shrink_control *sc)
{
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
+unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+ loff_t off, unsigned long flags, unsigned long size)
+{
+ unsigned long addr;
+ loff_t off_end = off + len;
+ loff_t off_align = round_up(off, size);
+ unsigned long len_pad;
+
+ if (off_end <= off_align || (off_end - off_align) < size)
+ return 0;
+
+ len_pad = len + size;
+ if (len_pad < len || (off + len_pad) < off)
+ return 0;
+
+ addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+ off >> PAGE_SHIFT, flags);
+ if (IS_ERR_VALUE(addr))
+ return 0;
+
+ addr += (off - addr) & (size - 1);
+ return addr;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+ if (addr)
+ goto out;
+ if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+ goto out;
+
+ addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+ if (addr)
+ return addr;
+
+ out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
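+/*
+ * Worked example of the padding math above (assumed numbers): for a DAX
+ * mapping with off = 0, len = 4MB and size = PMD_SIZE = 2MB, len_pad becomes
+ * 6MB. If get_unmapped_area() returns an unaligned address, the final
+ * "addr += (off - addr) & (size - 1)" rounds it up inside the padded range
+ * so that (addr - off) is 2MB aligned, letting the file be mapped with PMD
+ * entries.
+ */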
static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
gfp_t gfp)
{
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
- zero_page = get_huge_zero_page();
+ zero_page = mm_get_huge_zero_page(vma->vm_mm);
if (unlikely(!zero_page)) {
pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
}
} else
spin_unlock(fe->ptl);
- if (!set) {
+ if (!set)
pte_free(vma->vm_mm, pgtable);
- put_huge_zero_page();
- }
return ret;
}
gfp = alloc_hugepage_direct_gfpmask(vma);
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = get_huge_zero_page();
+ zero_page = mm_get_huge_zero_page(dst_mm);
set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
ret = 0;
update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page, true);
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- put_huge_zero_page();
}
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- if (is_huge_zero_pmd(_pmd))
- put_huge_zero_page();
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
- bool whole_zone; /* Whole zone has been scanned */
+ bool whole_zone; /* Whole zone should/has been scanned */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
* value (scan code).
*/
-static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+ struct vm_area_struct **vmap)
{
struct vm_area_struct *vma;
unsigned long hstart, hend;
if (unlikely(khugepaged_test_exit(mm)))
return SCAN_ANY_PROCESS;
- vma = find_vma(mm, address);
+ *vmap = vma = find_vma(mm, address);
if (!vma)
return SCAN_VMA_NULL;
.pmd = pmd,
};
+ /* we only decide to swap in if there are enough young ptes */
+ if (referenced < HPAGE_PMD_NR/2) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
fe.pte = pte_offset_map(pmd, address);
for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
fe.pte++, fe.address += PAGE_SIZE) {
if (!is_swap_pte(pteval))
continue;
swapped_in++;
- /* we only decide to swapin, if there is enough young ptes */
- if (referenced < HPAGE_PMD_NR/2) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
ret = do_swap_page(&fe, pteval);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address)) {
+ if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- struct vm_area_struct *vma,
int node, int referenced)
{
pmd_t *pmd, _pmd;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
struct mem_cgroup *memcg;
+ struct vm_area_struct *vma;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t gfp;
}
down_read(&mm->mmap_sem);
- result = hugepage_vma_revalidate(mm, address);
+ result = hugepage_vma_revalidate(mm, address, &vma);
if (result) {
mem_cgroup_cancel_charge(new_page, memcg, true);
up_read(&mm->mmap_sem);
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- result = hugepage_vma_revalidate(mm, address);
+ result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
/* check if the pmd is still valid */
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, vma, node, referenced);
+ collapse_huge_page(mm, address, hpage, node, referenced);
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
return memblock.memory.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_size(void)
+{
+ return memblock.reserved.total_size;
+}
+
phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
{
unsigned long pages = 0;
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+/**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
+{
+ struct mem_cgroup *iter;
+ int ret = 0;
+
+ BUG_ON(memcg == root_mem_cgroup);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&iter->css, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret = fn(task, arg);
+ css_task_iter_end(&it);
+ if (ret) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ }
+ return ret;
+}
+
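+/*
+ * Usage sketch (the callback below is hypothetical, shown for illustration):
+ * count every task in a memcg hierarchy. The memcg OOM path uses the same
+ * pattern with oom_evaluate_task() as the callback.
+ *
+ *   static int count_task(struct task_struct *task, void *arg)
+ *   {
+ *           (*(unsigned int *)arg)++;
+ *           return 0;        (returning 0 keeps iterating)
+ *   }
+ *
+ *   unsigned int nr = 0;
+ *   mem_cgroup_scan_tasks(memcg, count_task, &nr);
+ */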
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
unsigned long limit;
.gfp_mask = gfp_mask,
.order = order,
};
- struct mem_cgroup *iter;
- unsigned long chosen_points = 0;
- unsigned long totalpages;
- unsigned int points = 0;
- struct task_struct *chosen = NULL;
+ bool ret;
mutex_lock(&oom_lock);
-
- /*
- * If current has a pending SIGKILL or is exiting, then automatically
- * select it. The goal is to allow it to allocate so that it may
- * quickly exit and free its memory.
- */
- if (task_will_free_mem(current)) {
- mark_oom_victim(current);
- wake_oom_reaper(current);
- goto unlock;
- }
-
- check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
- totalpages = mem_cgroup_get_limit(memcg) ? : 1;
- for_each_mem_cgroup_tree(iter, memcg) {
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&iter->css, &it);
- while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(&oc, task)) {
- case OOM_SCAN_SELECT:
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = ULONG_MAX;
- get_task_struct(chosen);
- /* fall through */
- case OOM_SCAN_CONTINUE:
- continue;
- case OOM_SCAN_ABORT:
- css_task_iter_end(&it);
- mem_cgroup_iter_break(memcg, iter);
- if (chosen)
- put_task_struct(chosen);
- /* Set a dummy value to return "true". */
- chosen = (void *) 1;
- goto unlock;
- case OOM_SCAN_OK:
- break;
- };
- points = oom_badness(task, memcg, NULL, totalpages);
- if (!points || points < chosen_points)
- continue;
- /* Prefer thread group leaders for display purposes */
- if (points == chosen_points &&
- thread_group_leader(chosen))
- continue;
-
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
- css_task_iter_end(&it);
- }
-
- if (chosen) {
- points = chosen_points * 1000 / totalpages;
- oom_kill_process(&oc, chosen, points, totalpages,
- "Memory cgroup out of memory");
- }
-unlock:
+ ret = out_of_memory(&oc);
mutex_unlock(&oom_lock);
- return chosen;
+ return ret;
}
#if MAX_NUMNODES > 1
if (!memcg)
return false;
- if (!handle || oom_killer_disabled)
+ if (!handle)
goto cleanup;
owait.memcg = memcg;
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
+ VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
atomic_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
+ VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
idr_remove(&mem_cgroup_idr, memcg->id.id);
memcg->id.id = 0;
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
/* Online state pins memcg ID, memcg ID pins CSS */
- mem_cgroup_id_get(mem_cgroup_from_css(css));
+ atomic_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
- page = find_get_page(swap_address_space(ent), ent.val);
+ page = find_get_page(swap_address_space(ent), swp_offset(ent));
if (do_memsw_account())
entry->val = ent.val;
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
- page = find_get_page(swap_address_space(swp), swp.val);
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
}
} else
page = find_get_page(mapping, pgoff);
.mm = mm,
};
down_read(&mm->mmap_sem);
- walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
+ walk_page_range(0, mm->highest_vm_end,
+ &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
+ walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
+ pgprot_t pgprot = vma->vm_page_prot;
+
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return -EINVAL;
/*
* If we don't have pte special, then we have to use the pfn_valid()
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+ return insert_pfn(vma, addr, pfn, pgprot);
}
EXPORT_SYMBOL(vm_insert_mixed);
mem_cgroup_oom_synchronize(false);
}
+ /*
+ * This mm has already been reaped by the oom reaper, so its
+ * contents cannot be trusted in general: an anonymous refault
+ * would silently hand back a zero page instead of the lost data.
+ * This is especially a problem for use_mm(), because a regular
+ * task would simply die, whereas a kthread outlives the oom
+ * victim and could propagate the corrupted data further.
+ */
+ if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+ && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+ ret = VM_FAULT_SIGBUS;
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
return alloc_huge_page_node(page_hstate(compound_head(page)),
next_node_in(nid, nmask));
- node_clear(nid, nmask);
+ if (nid != next_node_in(nid, nmask))
+ node_clear(nid, nmask);
+
if (PageHighMem(page)
|| (zone_idx(page_zone(page)) == ZONE_MOVABLE))
gfp_mask |= __GFP_HIGHMEM;
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(node)->node_zonelists[0];
+ zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
return z->zone ? z->zone->node : node;
*/
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp), swp.val);
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
}
} else
page = find_get_page(mapping, pgoff);
} else {
#ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
- entry.val);
+ swp_offset(entry));
#else
WARN_ON(1);
*vec = 1;
int nr_pages;
int ret = 0;
int lock = !!(newflags & VM_LOCKED);
+ vm_flags_t old_flags = vma->vm_flags;
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
nr_pages = (end - start) >> PAGE_SHIFT;
if (!lock)
nr_pages = -nr_pages;
+ else if (old_flags & VM_LOCKED)
+ nr_pages = 0;
mm->locked_vm += nr_pages;
/*
return error;
}
+/*
+ * Walk the VMAs that intersect [start, start + len) and sum the sizes
+ * of the already-mlocked ranges. The deferred locking case (mlock2()
+ * with MLOCK_ONFAULT) is counted as well.
+ * Return value: the number of previously mlocked pages in the range.
+ */
+static int count_mm_mlocked_page_nr(struct mm_struct *mm,
+ unsigned long start, size_t len)
+{
+ struct vm_area_struct *vma;
+ int count = 0;
+
+ if (mm == NULL)
+ mm = current->mm;
+
+ vma = find_vma(mm, start);
+ if (vma == NULL)
+ vma = mm->mmap;
+
+ for (; vma ; vma = vma->vm_next) {
+ if (start >= vma->vm_end)
+ continue;
+ if (start + len <= vma->vm_start)
+ break;
+ if (vma->vm_flags & VM_LOCKED) {
+ if (start > vma->vm_start)
+ count -= (start - vma->vm_start);
+ if (start + len < vma->vm_end) {
+ count += start + len - vma->vm_start;
+ break;
+ }
+ count += vma->vm_end - vma->vm_start;
+ }
+ }
+
+ return count >> PAGE_SHIFT;
+}
+
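+/*
+ * Worked example (assumed numbers, 4K pages): with an existing VM_LOCKED
+ * vma covering [0x1000, 0x5000) and a request of start = 0x2000,
+ * len = 0x6000, the loop computes -(0x2000 - 0x1000) + (0x5000 - 0x1000)
+ * = 0x3000, i.e. 3 already-mlocked pages that do_mlock() must not count
+ * again.
+ */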
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
return -EINTR;
locked += current->mm->locked_vm;
+ if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
+ /*
+ * It is possible that the requested region intersects ranges that
+ * are already mlocked; the part already accounted in
+ * "mm->locked_vm" must not be counted again towards the new mlock
+ * increment. Check and adjust the locked count if necessary.
+ */
+ locked -= count_mm_mlocked_page_nr(current->mm,
+ start, len);
+ }
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/slab.h>
-#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/bug.h>
#include <asm/io.h>
-#include <asm/processor.h>
#include "internal.h"
+#ifndef CONFIG_HAVE_MEMBLOCK
+#error CONFIG_HAVE_MEMBLOCK not defined
+#endif
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
return __alloc_bootmem_node(pgdat, size, align, goal);
}
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
/**
* __alloc_bootmem_low - allocate low boot memory
return oc->order == -1;
}
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+ return oc->memcg != NULL;
+}
+
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask)
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+ test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
return 0;
return points > 0 ? points : 1;
}
+enum oom_constraint {
+ CONSTRAINT_NONE,
+ CONSTRAINT_CPUSET,
+ CONSTRAINT_MEMORY_POLICY,
+ CONSTRAINT_MEMCG,
+};
+
/*
* Determine the type of allocation constraint.
*/
-#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
struct zone *zone;
struct zoneref *z;
bool cpuset_limited = false;
int nid;
+ if (is_memcg_oom(oc)) {
+ oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+ return CONSTRAINT_MEMCG;
+ }
+
/* Default to all available memory */
- *totalpages = totalram_pages + total_swap_pages;
+ oc->totalpages = totalram_pages + total_swap_pages;
+
+ if (!IS_ENABLED(CONFIG_NUMA))
+ return CONSTRAINT_NONE;
if (!oc->zonelist)
return CONSTRAINT_NONE;
*/
if (oc->nodemask &&
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
- *totalpages = total_swap_pages;
+ oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
- *totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
cpuset_limited = true;
if (cpuset_limited) {
- *totalpages = total_swap_pages;
+ oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
- *totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_spanned_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
}
-#else
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
- unsigned long *totalpages)
-{
- *totalpages = totalram_pages + total_swap_pages;
- return CONSTRAINT_NONE;
-}
-#endif
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task)
+static int oom_evaluate_task(struct task_struct *task, void *arg)
{
+ struct oom_control *oc = arg;
+ unsigned long points;
+
if (oom_unkillable_task(task, NULL, oc->nodemask))
- return OOM_SCAN_CONTINUE;
+ goto next;
/*
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves unless
- * the task has MMF_OOM_REAPED because chances that it would release
+ * the task has MMF_OOM_SKIP because chances that it would release
* any memory is quite low.
*/
- if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
- struct task_struct *p = find_lock_task_mm(task);
- enum oom_scan_t ret = OOM_SCAN_ABORT;
-
- if (p) {
- if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
- ret = OOM_SCAN_CONTINUE;
- task_unlock(p);
- }
-
- return ret;
+ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
+ if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ goto next;
+ goto abort;
}
/*
* If task is allocating a lot of memory and has been marked to be
* killed first if it triggers an oom, then select it.
*/
- if (oom_task_origin(task))
- return OOM_SCAN_SELECT;
+ if (oom_task_origin(task)) {
+ points = ULONG_MAX;
+ goto select;
+ }
- return OOM_SCAN_OK;
+ points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+ if (!points || points < oc->chosen_points)
+ goto next;
+
+ /* Prefer thread group leaders for display purposes */
+ if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+ goto next;
+select:
+ if (oc->chosen)
+ put_task_struct(oc->chosen);
+ get_task_struct(task);
+ oc->chosen = task;
+ oc->chosen_points = points;
+next:
+ return 0;
+abort:
+ if (oc->chosen)
+ put_task_struct(oc->chosen);
+ oc->chosen = (void *)-1UL;
+ return 1;
}
/*
- * Simple selection loop. We chose the process with the highest
- * number of 'points'. Returns -1 on scan abort.
+ * Simple selection loop. We choose the process with the highest number of
+ * 'points'. In case scan was aborted, oc->chosen is set to -1.
*/
-static struct task_struct *select_bad_process(struct oom_control *oc,
- unsigned int *ppoints, unsigned long totalpages)
+static void select_bad_process(struct oom_control *oc)
{
- struct task_struct *p;
- struct task_struct *chosen = NULL;
- unsigned long chosen_points = 0;
-
- rcu_read_lock();
- for_each_process(p) {
- unsigned int points;
-
- switch (oom_scan_process_thread(oc, p)) {
- case OOM_SCAN_SELECT:
- chosen = p;
- chosen_points = ULONG_MAX;
- /* fall through */
- case OOM_SCAN_CONTINUE:
- continue;
- case OOM_SCAN_ABORT:
- rcu_read_unlock();
- return (struct task_struct *)(-1UL);
- case OOM_SCAN_OK:
- break;
- };
- points = oom_badness(p, NULL, oc->nodemask, totalpages);
- if (!points || points < chosen_points)
- continue;
+ if (is_memcg_oom(oc))
+ mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+ else {
+ struct task_struct *p;
- chosen = p;
- chosen_points = points;
+ rcu_read_lock();
+ for_each_process(p)
+ if (oom_evaluate_task(p, oc))
+ break;
+ rcu_read_unlock();
}
- if (chosen)
- get_task_struct(chosen);
- rcu_read_unlock();
- *ppoints = chosen_points * 1000 / totalpages;
- return chosen;
+ oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}
/**
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+ if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
+ pr_warn("COMPACTION is disabled!!!\n");
cpuset_print_current_mems_allowed();
dump_stack();
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
-bool oom_killer_disabled __read_mostly;
+static bool oom_killer_disabled __read_mostly;
#define K(x) ((x) << (PAGE_SHIFT-10))
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-static bool __oom_reap_task(struct task_struct *tsk)
+static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- struct mm_struct *mm = NULL;
- struct task_struct *p;
struct zap_details details = {.check_swap_entries = true,
.ignore_dirty = true};
bool ret = true;
/*
* We have to make sure to not race with the victim exit path
* and cause premature new oom victim selection:
- * __oom_reap_task exit_mm
+ * __oom_reap_task_mm exit_mm
* mmget_not_zero
* mmput
* atomic_dec_and_test
*/
mutex_lock(&oom_lock);
- /*
- * Make sure we find the associated mm_struct even when the particular
- * thread has already terminated and cleared its mm.
- * We might have race with exit path so consider our work done if there
- * is no mm.
- */
- p = find_lock_task_mm(tsk);
- if (!p)
- goto unlock_oom;
- mm = p->mm;
- atomic_inc(&mm->mm_count);
- task_unlock(p);
-
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
- goto mm_drop;
+ goto unlock_oom;
}
/*
*/
if (!mmget_not_zero(mm)) {
up_read(&mm->mmap_sem);
- goto mm_drop;
+ goto unlock_oom;
}
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers are really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
tlb_gather_mmu(&tlb, mm, 0, -1);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (is_vm_hugetlb_page(vma))
K(get_mm_counter(mm, MM_SHMEMPAGES)));
up_read(&mm->mmap_sem);
- /*
- * This task can be safely ignored because we cannot do much more
- * to release its memory.
- */
- set_bit(MMF_OOM_REAPED, &mm->flags);
/*
* Drop our reference but make sure the mmput slow path is called from a
* different context because we shouldn't risk we get stuck there and
* put the oom_reaper out of the way.
*/
mmput_async(mm);
-mm_drop:
- mmdrop(mm);
unlock_oom:
mutex_unlock(&oom_lock);
return ret;
static void oom_reap_task(struct task_struct *tsk)
{
int attempts = 0;
+ struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
- if (attempts > MAX_OOM_REAP_RETRIES) {
- struct task_struct *p;
+ if (attempts <= MAX_OOM_REAP_RETRIES)
+ goto done;
- pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
- task_pid_nr(tsk), tsk->comm);
- /*
- * If we've already tried to reap this task in the past and
- * failed it probably doesn't make much sense to try yet again
- * so hide the mm from the oom killer so that it can move on
- * to another task with a different mm struct.
- */
- p = find_lock_task_mm(tsk);
- if (p) {
- if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
- pr_info("oom_reaper: giving up pid:%d (%s)\n",
- task_pid_nr(tsk), tsk->comm);
- set_bit(MMF_OOM_REAPED, &p->mm->flags);
- }
- task_unlock(p);
- }
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
- debug_show_all_locks();
- }
+done:
+ tsk->oom_reaper_list = NULL;
/*
- * Clear TIF_MEMDIE because the task shouldn't be sitting on a
- * reasonably reclaimable memory anymore or it is not a good candidate
- * for the oom victim right now because it cannot release its memory
- * itself nor by the oom reaper.
+ * Hide this mm from the OOM killer: it has either been reaped, or
+ * somebody is stuck holding mmap_sem and it cannot be reaped.
*/
- tsk->oom_reaper_list = NULL;
- exit_oom_victim(tsk);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
static int oom_reaper(void *unused)
{
- set_freezable();
-
while (true) {
struct task_struct *tsk = NULL;
return 0;
}
-void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct task_struct *tsk)
{
if (!oom_reaper_th)
return;
return 0;
}
subsys_initcall(oom_init)
-#endif
+#else
+static inline void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif /* CONFIG_MMU */
/**
* mark_oom_victim - mark the given task as OOM victim
*
* Has to be called with oom_lock held and never after
* oom has been disabled already.
+ *
+ * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
+ * (either under task_lock or by operating on current).
*/
-void mark_oom_victim(struct task_struct *tsk)
+static void mark_oom_victim(struct task_struct *tsk)
{
+ struct mm_struct *mm = tsk->mm;
+
WARN_ON(oom_killer_disabled);
/* OOM killer might race with memcg OOM */
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
return;
- atomic_inc(&tsk->signal->oom_victims);
+
+ /* oom_mm is bound to the signal struct life time. */
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+ atomic_inc(&tsk->signal->oom_mm->mm_count);
+
/*
* Make sure that the task is woken up from uninterruptible sleep
* if it is frozen because OOM killer wouldn't be able to free
/**
* exit_oom_victim - note the exit of an OOM victim
*/
-void exit_oom_victim(struct task_struct *tsk)
+void exit_oom_victim(void)
{
- if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
- return;
- atomic_dec(&tsk->signal->oom_victims);
+ clear_thread_flag(TIF_MEMDIE);
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
}
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+ oom_killer_disabled = false;
+}
+
/**
* oom_killer_disable - disable OOM killer
+ * @timeout: maximum timeout to wait for oom victims in jiffies
*
* Forces all page allocations to fail rather than trigger OOM killer.
- * Will block and wait until all OOM victims are killed.
+ * Will block and wait until all OOM victims are killed or the given
+ * timeout expires.
*
* The function cannot be called when there are runnable user tasks because
* the userspace would see unexpected allocation failures as a result. Any
* Returns true if successful and false if the OOM killer cannot be
* disabled.
*/
-bool oom_killer_disable(void)
+bool oom_killer_disable(signed long timeout)
{
+ signed long ret;
+
/*
* Make sure to not race with an ongoing OOM killer. Check that the
* current is not killed (possibly due to sharing the victim's memory).
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
- wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+ ret = wait_event_interruptible_timeout(oom_victims_wait,
+ !atomic_read(&oom_victims), timeout);
+ if (ret <= 0) {
+ oom_killer_enable();
+ return false;
+ }
return true;
}
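+
+/*
+ * Caller sketch (an assumption about the suspend path): a user of the
+ * timeout variant typically treats failure as "cannot freeze", e.g.
+ *
+ *   if (!oom_killer_disable(msecs_to_jiffies(20 * 1000)))
+ *           return -EBUSY;
+ *
+ * and re-enables the OOM killer with oom_killer_enable() on thaw.
+ */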
-/**
- * oom_killer_enable - enable OOM killer
- */
-void oom_killer_enable(void)
-{
- oom_killer_disabled = false;
-}
-
static inline bool __task_will_free_mem(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
* Caller has to make sure that task->mm is stable (hold task_lock or
* it operates on the current).
*/
-bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
- if (test_bit(MMF_OOM_REAPED, &mm->flags))
+ if (test_bit(MMF_OOM_SKIP, &mm->flags))
return false;
if (atomic_read(&mm->mm_users) <= 1)
return true;
/*
- * This is really pessimistic but we do not have any reliable way
- * to check that external processes share with our mm
+ * Check that all tasks which share the mm with the given task are
+ * dying as well, so that a) nobody pins its mm and b) the task is
+ * also reapable by the oom reaper.
*/
rcu_read_lock();
for_each_process(p) {
return ret;
}
-/*
- * Must be called while holding a reference to p, which will be released upon
- * returning.
- */
-void oom_kill_process(struct oom_control *oc, struct task_struct *p,
- unsigned int points, unsigned long totalpages,
- const char *message)
+static void oom_kill_process(struct oom_control *oc, const char *message)
{
+ struct task_struct *p = oc->chosen;
+ unsigned int points = oc->chosen_points;
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t;
* oom_badness() returns 0 if the thread is unkillable
*/
child_points = oom_badness(child,
- oc->memcg, oc->nodemask, totalpages);
+ oc->memcg, oc->nodemask, oc->totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
victim = child;
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
- /*
- * We cannot use oom_reaper for the mm shared by this
- * process because it wouldn't get killed and so the
- * memory might be still used. Hide the mm from the oom
- * killer to guarantee OOM forward progress.
- */
+ if (is_global_init(p)) {
can_oom_reap = false;
- set_bit(MMF_OOM_REAPED, &mm->flags);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
continue;
}
+ /*
+ * No use_mm() user needs to read from userspace, so it is safe
+ * to reap this mm.
+ */
+ if (unlikely(p->flags & PF_KTHREAD))
+ continue;
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint)
{
if (likely(!sysctl_panic_on_oom))
return;
*/
bool out_of_memory(struct oom_control *oc)
{
- struct task_struct *p;
- unsigned long totalpages;
unsigned long freed = 0;
- unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
if (oom_killer_disabled)
return false;
- blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
- /* Got some memory back in the last second. */
- return true;
+ if (!is_memcg_oom(oc)) {
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ if (freed > 0)
+ /* Got some memory back in the last second. */
+ return true;
+ }
/*
* If current has a pending SIGKILL or is exiting, then automatically
/*
* Check if there were limitations on the allocation (only relevant for
- * NUMA) that may require different handling.
+ * NUMA and memcg) that may require different handling.
*/
- constraint = constrained_alloc(oc, &totalpages);
+ constraint = constrained_alloc(oc);
if (constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
check_panic_on_oom(oc, constraint);
- if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, oc->nodemask) &&
+ if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
+ current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(oc, current, 0, totalpages,
- "Out of memory (oom_kill_allocating_task)");
+ oc->chosen = current;
+ oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
return true;
}
- p = select_bad_process(oc, &points, totalpages);
+ select_bad_process(oc);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p && !is_sysrq_oom(oc)) {
+ if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p && p != (void *)-1UL) {
- oom_kill_process(oc, p, points, totalpages, "Out of memory");
+ if (oc->chosen && oc->chosen != (void *)-1UL) {
+ oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+ "Memory cgroup out of memory");
/*
* Give the killed process a good chance to exit before trying
* to allocate memory again.
*/
schedule_timeout_killable(1);
}
- return true;
+ return !!oc->chosen;
}
/*
if (!mutex_trylock(&oom_lock))
return;
-
- if (!out_of_memory(&oc)) {
- /*
- * There shouldn't be any user tasks runnable while the
- * OOM killer is disabled, so the current task has to
- * be a racing OOM victim for which oom_killer_disable()
- * is waiting for.
- */
- WARN_ON(test_thread_flag(TIF_MEMDIE));
- }
-
+ out_of_memory(&oc);
mutex_unlock(&oom_lock);
}
return false;
}
-void throttle_vm_writeout(gfp_t gfp_mask)
-{
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- for ( ; ; ) {
- global_dirty_limits(&background_thresh, &dirty_thresh);
- dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
-
- /*
- * Boost the allowable dirty threshold a bit for page
- * allocators so they don't get DoS'ed by heavy writers
- */
- dirty_thresh += dirty_thresh / 10; /* wheeee... */
-
- if (global_node_page_state(NR_UNSTABLE_NFS) +
- global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
-
- /*
- * The caller might hold locks which can prevent IO completion
- * or progress in the filesystem. So we cannot just sit here
- * waiting for IO to complete.
- */
- if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
- break;
- }
-}
-
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int ret;
lock_page_memcg(page);
- if (mapping) {
+ if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
int ret;
lock_page_memcg(page);
- if (mapping) {
+ if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
-static unsigned long __meminitdata dma_reserve;
+static unsigned long __meminitdata nr_memory_reserve;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
if (!debug_pagealloc_enabled())
return false;
+ if (!debug_guardpage_minorder())
+ return false;
+
return true;
}
if (!debug_pagealloc_enabled())
return;
+ if (!debug_guardpage_minorder())
+ return;
+
_debug_guardpage_enabled = true;
}
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
-__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
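+
+/*
+ * Usage sketch (assumed boot command line): guard pages only take effect
+ * when both debug_pagealloc and a non-zero minorder are requested, e.g.
+ *
+ *   debug_pagealloc=on debug_guardpage_minorder=1
+ *
+ * which turns only the order-0 buddies split off in expand() into unmapped
+ * guard pages.
+ */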
-static inline void set_page_guard(struct zone *zone, struct page *page,
+static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
struct page_ext *page_ext;
if (!debug_guardpage_enabled())
- return;
+ return false;
+
+ if (order >= debug_guardpage_minorder())
+ return false;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
- return;
+ return false;
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
}
static inline void clear_page_guard(struct zone *zone, struct page *page,
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-struct page_ext_operations debug_guardpage_ops = { NULL, };
-static inline void set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+struct page_ext_operations debug_guardpage_ops;
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
return;
/* Free a large naturally-aligned chunk if possible */
- if (nr_pages == MAX_ORDER_NR_PAGES &&
- (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ if (nr_pages == pageblock_nr_pages &&
+ (pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, MAX_ORDER-1);
+ __free_pages_boot_core(page, pageblock_order);
return;
}
- for (i = 0; i < nr_pages; i++, page++)
+ for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_boot_core(page, 0);
+ }
}
/* Completion tracking for deferred_init_memmap() threads */
/*
* Ensure pfn_valid is checked every
- * MAX_ORDER_NR_PAGES for memory holes
+ * pageblock_nr_pages for memory holes
*/
- if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0) {
if (!pfn_valid(pfn)) {
page = NULL;
goto free_range;
}
/* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
page++;
} else {
nr_pages += nr_to_free;
free_base_page = NULL;
free_base_pfn = nr_to_free = 0;
}
+ /* Free the last block of pages to the allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
first_init_pfn = max(end_pfn, first_init_pfn);
}
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
- debug_guardpage_enabled() &&
- high < debug_guardpage_minorder()) {
- /*
- * Mark as guard pages (or page), that will allow to
- * merge back to allocator when buddy will be freed.
- * Corresponding page table entries will not be touched,
- * pages will stay not present in virtual address space
- */
- set_page_guard(zone, &page[size], high, migratetype);
+ /*
+ * Mark as a guard page (or pages) so it can be merged back into
+ * the allocator when its buddy is freed. The corresponding page
+ * table entries are not touched; the pages remain not present in
+ * the virtual address space.
+ */
+ if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- }
+
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
- /* Obey watermarks as if the page was being allocated */
- watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ /*
+ * Obey watermarks as if the page was being allocated. We can
+ * emulate a high-order watermark check with a raised order-0
+ * watermark, because we already know our high-order page
+ * exists.
+ */
+ watermark = min_wmark_pages(zone) + (1UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
return NULL;
}
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
+ int compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+ int min_priority;
+
+ if (!order)
+ return false;
+
+ /*
+ * compaction considers the whole zone to be desperately out of memory,
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by insufficient priority
+ */
+ if (compaction_failed(compact_result))
+ goto check_priority;
+
+ /*
+ * Make sure the compaction wasn't deferred and didn't bail out early
+ * due to lock contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+ * !costly requests are much more important than costly __GFP_REPEAT
+ * ones because they are de facto nofail and invoke the OOM killer
+ * in order to move on, while costly requests can fail and their users
+ * are ready to cope with that. Allowing 1/4 of the retries is rather
+ * arbitrary, but we would need much more detailed feedback from
+ * compaction to make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (compaction_retries <= max_retries)
+ return true;
+
+ /*
+ * Make sure there is at least one attempt at the highest priority
+ * if we exhausted all retries at the lower priorities
+ */
+check_priority:
+ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+ if (*compact_priority > min_priority) {
+ (*compact_priority)--;
+ return true;
+ }
+ return false;
+}
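+
+/*
+ * Worked example (assuming MAX_COMPACT_RETRIES is 16): an order-9, i.e.
+ * costly, request is retried at most 16/4 = 4 times at a given priority;
+ * once those are spent, compact_priority is stepped towards
+ * MIN_COMPACT_COSTLY_PRIORITY, the strongest mode costly requests are
+ * allowed, before we finally give up.
+ */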
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
}
-#endif /* CONFIG_COMPACTION */
-
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
}
return false;
}
+#endif /* CONFIG_COMPACTION */
/* Perform direct synchronous page reclaim */
static int
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[1];
+ zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
struct zone *z;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
break;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
- * from zone_movable_pfn[nid] to end of each node should be
- * ZONE_MOVABLE not ZONE_NORMAL. skip it.
- */
- if (!mirrored_kernelcore && zone_movable_pfn[nid])
- if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
- continue;
-
/*
* Check given memblock attribute by firmware which can affect
* kernel memory layout. If zone==ZONE_MOVABLE but memory is
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (!mirrored_kernelcore &&
+ *zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
* and vice versa.
*/
- if (zone_movable_pfn[nid]) {
- if (mirrored_kernelcore) {
- unsigned long start_pfn, end_pfn;
- struct memblock_region *r;
-
- for_each_memblock(memory, r) {
- start_pfn = clamp(memblock_region_memory_base_pfn(r),
- zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(memblock_region_memory_end_pfn(r),
- zone_start_pfn, zone_end_pfn);
-
- if (zone_type == ZONE_MOVABLE &&
- memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
-
- if (zone_type == ZONE_NORMAL &&
- !memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
- }
- } else {
- if (zone_type == ZONE_NORMAL)
- nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
}
}
}
/* Account for reserved pages */
- if (j == 0 && freesize > dma_reserve) {
- freesize -= dma_reserve;
+ if (j == 0 && freesize > nr_memory_reserve) {
+ freesize -= nr_memory_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
- zone_names[0], dma_reserve);
+ zone_names[0], nr_memory_reserve);
}
if (!is_highmem_idx(j))
}
/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
+ * set_memory_reserve - set number of pages reserved in the first zone
+ * @nr_reserve: The number of pages to mark reserved
+ * @inc: if true, add to the existing reserve; if false, set a new value.
*
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
* smaller per-cpu batchsize.
*/
-void __init set_dma_reserve(unsigned long new_dma_reserve)
+void __init set_memory_reserve(unsigned long nr_reserve, bool inc)
{
- dma_reserve = new_dma_reserve;
+ if (inc)
+ nr_memory_reserve += nr_reserve;
+ else
+ nr_memory_reserve = nr_reserve;
}
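+
+/*
+ * Caller sketch (hypothetical, for illustration only): an architecture that
+ * sets aside extra low memory, e.g. for a firmware-assisted dump, can add it
+ * on top of whatever was already reserved:
+ *
+ *   set_memory_reserve(fadump_reserved_bytes >> PAGE_SHIFT, true);
+ *
+ * while the old set_dma_reserve() behaviour corresponds to inc == false.
+ */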
void __init free_area_init(unsigned long *zones_size)
__setup("hashdist=", set_hashdist);
#endif
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that the architecture has reserved but
+ * that are not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
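+/*
+ * Override sketch (hypothetical): an architecture that keeps a large region
+ * reserved, and unknown to the allocator at this point, would define
+ * __HAVE_ARCH_RESERVED_KERNEL_PAGES and supply its own
+ * arch_reserved_kernel_pages() returning that region's size in pages, so
+ * that the large system hashes below are not sized against memory the
+ * kernel will never get to use.
+ */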
/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SHIFT < 20)
* and page extension core can skip to allocate memory. As result,
* none of memory is wasted.
*
+ * When the need callback returns true, page_ext checks whether extra
+ * memory was requested through the size field of struct page_ext_operations.
+ * If it is non-zero, extra space is allocated for each page_ext entry and
+ * the offset of that space is reported back through the offset field.
+ *
* The init callback is used to do proper initialization after page extension
* is completely initialized. In sparse memory system, extra memory is
* allocated some time later than memmap is allocated. In other words, lifetime
};
static unsigned long total_usage;
+static unsigned long extra_mem;
static bool __init invoke_need_callbacks(void)
{
int i;
int entries = ARRAY_SIZE(page_ext_ops);
+ bool need = false;
for (i = 0; i < entries; i++) {
- if (page_ext_ops[i]->need && page_ext_ops[i]->need())
- return true;
+ if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+ page_ext_ops[i]->offset = sizeof(struct page_ext) +
+ extra_mem;
+ extra_mem += page_ext_ops[i]->size;
+ need = true;
+ }
}
- return false;
+ return need;
}
static void __init invoke_init_callbacks(void)
}
}
+static unsigned long get_entry_size(void)
+{
+ return sizeof(struct page_ext) + extra_mem;
+}
+
+static inline struct page_ext *get_entry(void *base, unsigned long index)
+{
+ return base + get_entry_size() * index;
+}
+
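+/*
+ * Worked example (assuming page_owner is the only client): extra_mem is
+ * sizeof(struct page_owner), page_owner_ops.offset is sizeof(struct
+ * page_ext), every entry occupies sizeof(struct page_ext) +
+ * sizeof(struct page_owner) bytes, and entry i lives at
+ * base + i * get_entry_size().
+ */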
#if !defined(CONFIG_SPARSEMEM)
struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
- unsigned long offset;
+ unsigned long index;
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
if (unlikely(!base))
return NULL;
#endif
- offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
+ index = pfn - round_down(node_start_pfn(page_to_nid(page)),
MAX_ORDER_NR_PAGES);
- return base + offset;
+ return get_entry(base, index);
}
static int __init alloc_node_page_ext(int nid)
!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
nr_pages += MAX_ORDER_NR_PAGES;
- table_size = sizeof(struct page_ext) * nr_pages;
+ table_size = get_entry_size() * nr_pages;
base = memblock_virt_alloc_try_nid_nopanic(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
if (!section->page_ext)
return NULL;
#endif
- return section->page_ext + pfn;
+ return get_entry(section->page_ext, pfn);
}
static void *__meminit alloc_page_ext(size_t size, int nid)
if (section->page_ext)
return 0;
- table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+ table_size = get_entry_size() * PAGES_PER_SECTION;
base = alloc_page_ext(table_size, nid);
/*
* we need to apply a mask.
*/
pfn &= PAGE_SECTION_MASK;
- section->page_ext = base - pfn;
+ section->page_ext = (void *)base - get_entry_size() * pfn;
total_usage += table_size;
return 0;
}
struct page *page = virt_to_page(addr);
size_t table_size;
- table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+ table_size = get_entry_size() * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
free_pages_exact(addr, table_size);
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_ext)
return;
- base = ms->page_ext + pfn;
+ base = get_entry(ms->page_ext, pfn);
free_page_ext(base);
ms->page_ext = NULL;
}
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
+#include <linux/seq_file.h>
#include "internal.h"
*/
#define PAGE_OWNER_STACK_DEPTH (16)
+struct page_owner {
+ unsigned int order;
+ gfp_t gfp_mask;
+ int last_migrate_reason;
+ depot_stack_handle_t handle;
+};
+
static bool page_owner_disabled = true;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
}
struct page_ext_operations page_owner_ops = {
+ .size = sizeof(struct page_owner),
.need = need_page_owner,
.init = init_page_owner,
};
+static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
+{
+ return (void *)page_ext + page_owner_ops.offset;
+}
+
void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
gfp_t gfp_mask)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
if (unlikely(!page_ext))
return;
- page_ext->handle = save_stack(gfp_mask);
- page_ext->order = order;
- page_ext->gfp_mask = gfp_mask;
- page_ext->last_migrate_reason = -1;
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = save_stack(gfp_mask);
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
+
if (unlikely(!page_ext))
return;
- page_ext->last_migrate_reason = reason;
+ page_owner = get_page_owner(page_ext);
+ page_owner->last_migrate_reason = reason;
}
void __split_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
if (unlikely(!page_ext))
return;
- page_ext->order = 0;
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
for (i = 1; i < (1 << order); i++)
__copy_page_owner(page, page + i);
}
{
struct page_ext *old_ext = lookup_page_ext(oldpage);
struct page_ext *new_ext = lookup_page_ext(newpage);
+ struct page_owner *old_page_owner, *new_page_owner;
if (unlikely(!old_ext || !new_ext))
return;
- new_ext->order = old_ext->order;
- new_ext->gfp_mask = old_ext->gfp_mask;
- new_ext->last_migrate_reason = old_ext->last_migrate_reason;
- new_ext->handle = old_ext->handle;
+ old_page_owner = get_page_owner(old_ext);
+ new_page_owner = get_page_owner(new_ext);
+ new_page_owner->order = old_page_owner->order;
+ new_page_owner->gfp_mask = old_page_owner->gfp_mask;
+ new_page_owner->last_migrate_reason =
+ old_page_owner->last_migrate_reason;
+ new_page_owner->handle = old_page_owner->handle;
/*
* We don't clear the bit on the oldpage as it's going to be freed
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}
+void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ struct page_owner *page_owner;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+ int pageblock_mt, page_mt;
+ int i;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ page = pfn_to_page(pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (PageBuddy(page)) {
+ pfn += (1UL << page_order(page)) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ continue;
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ page_owner = get_page_owner(page_ext);
+ page_mt = gfpflags_to_migratetype(
+ page_owner->gfp_mask);
+ if (pageblock_mt != page_mt) {
+ if (is_migrate_cma(pageblock_mt))
+ count[MIGRATE_MOVABLE]++;
+ else
+ count[pageblock_mt]++;
+
+ pfn = block_end_pfn;
+ break;
+ }
+ pfn += (1UL << page_owner->order) - 1;
+ }
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (i = 0; i < MIGRATE_TYPES; i++)
+ seq_printf(m, "%12lu ", count[i]);
+ seq_putc(m, '\n');
+}
+
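+/*
+ * Output sketch (illustrative values, assuming the vmstat caller): each zone
+ * contributes one line to the mixed-block section of /proc/pagetypeinfo,
+ * one count per migrate type, e.g.
+ *
+ *   Node 0, zone   Normal            1            0            5            0 ...
+ */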
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
- struct page *page, struct page_ext *page_ext,
+ struct page *page, struct page_owner *page_owner,
depot_stack_handle_t handle)
{
int ret;
ret = snprintf(kbuf, count,
"Page allocated via order %u, mask %#x(%pGg)\n",
- page_ext->order, page_ext->gfp_mask,
- &page_ext->gfp_mask);
+ page_owner->order, page_owner->gfp_mask,
+ &page_owner->gfp_mask);
if (ret >= count)
goto err;
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
- page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+ page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
if (ret >= count)
goto err;
- if (page_ext->last_migrate_reason != -1) {
+ if (page_owner->last_migrate_reason != -1) {
ret += snprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
- migrate_reason_names[page_ext->last_migrate_reason]);
+ migrate_reason_names[page_owner->last_migrate_reason]);
if (ret >= count)
goto err;
}
void __dump_page_owner(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
.nr_entries = 0,
pr_alert("There is not page extension available.\n");
return;
}
- gfp_mask = page_ext->gfp_mask;
+
+ page_owner = get_page_owner(page_ext);
+ gfp_mask = page_owner->gfp_mask;
mt = gfpflags_to_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
return;
}
- handle = READ_ONCE(page_ext->handle);
+ handle = READ_ONCE(page_owner->handle);
if (!handle) {
pr_alert("page_owner info is not active (free page?)\n");
return;
depot_fetch_stack(handle, &trace);
pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
print_stack_trace(&trace, 0);
- if (page_ext->last_migrate_reason != -1)
+ if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
- migrate_reason_names[page_ext->last_migrate_reason]);
+ migrate_reason_names[page_owner->last_migrate_reason]);
}
static ssize_t
unsigned long pfn;
struct page *page;
struct page_ext *page_ext;
+ struct page_owner *page_owner;
depot_stack_handle_t handle;
if (!static_branch_unlikely(&page_owner_inited))
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ page_owner = get_page_owner(page_ext);
+
/*
* Access to page_ext->handle isn't synchronous so we should
* be careful to access it.
*/
- handle = READ_ONCE(page_ext->handle);
+ handle = READ_ONCE(page_owner->handle);
if (!handle)
continue;
*ppos = (pfn - min_low_pfn) + 1;
return print_page_owner(buf, count, pfn, page,
- page_ext, handle);
+ page_owner, handle);
}
return 0;
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
+ parent->num_slabs = 0;
}
#define MAKE_LIST(cachep, listp, slab, nodeid) \
for_each_kmem_cache_node(cachep, node, n) {
unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
unsigned long active_slabs = 0, num_slabs = 0;
+ unsigned long num_slabs_partial = 0, num_slabs_free = 0;
+ unsigned long num_slabs_full;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru) {
- active_objs += cachep->num;
- active_slabs++;
- }
+ num_slabs = n->num_slabs;
list_for_each_entry(page, &n->slabs_partial, lru) {
active_objs += page->active;
- active_slabs++;
+ num_slabs_partial++;
}
list_for_each_entry(page, &n->slabs_free, lru)
- num_slabs++;
+ num_slabs_free++;
free_objects += n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
- num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
+ active_slabs = num_slabs - num_slabs_free;
+ num_slabs_full = num_slabs -
+ (num_slabs_partial + num_slabs_free);
+ active_objs += (num_slabs_full * cachep->num);
+
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
page = list_entry(p, struct page, lru);
list_del(&page->lru);
+ n->num_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
list_add_tail(&page->lru, &(n->slabs_free));
else
fixup_slab_list(cachep, n, page, &list);
+
+ n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
+ n->num_slabs--;
}
}
unsigned long num_objs;
unsigned long active_slabs = 0;
unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ unsigned long num_slabs_partial = 0, num_slabs_free = 0;
+ unsigned long num_slabs_full = 0;
const char *name;
char *error = NULL;
int node;
check_irq_on();
spin_lock_irq(&n->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru) {
- if (page->active != cachep->num && !error)
- error = "slabs_full accounting error";
- active_objs += cachep->num;
- active_slabs++;
- }
+ num_slabs += n->num_slabs;
+
list_for_each_entry(page, &n->slabs_partial, lru) {
if (page->active == cachep->num && !error)
error = "slabs_partial accounting error";
if (!page->active && !error)
error = "slabs_partial accounting error";
active_objs += page->active;
- active_slabs++;
+ num_slabs_partial++;
}
+
list_for_each_entry(page, &n->slabs_free, lru) {
if (page->active && !error)
error = "slabs_free accounting error";
- num_slabs++;
+ num_slabs_free++;
}
+
free_objects += n->free_objects;
if (n->shared)
shared_avail += n->shared->avail;
spin_unlock_irq(&n->list_lock);
}
- num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
+ active_slabs = num_slabs - num_slabs_free;
+ num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
+ active_objs += (num_slabs_full * cachep->num);
+
if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
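+ /* total number of slabs on this node: full + partial + free */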
+ unsigned long num_slabs;
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
locked_pgdat = NULL;
}
- if (is_huge_zero_page(page)) {
- put_huge_zero_page();
+ if (is_huge_zero_page(page))
continue;
- }
page = compound_head(page);
if (!put_page_testzero(page))
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
+ /* swap cache doesn't use writeback related tags */
+ .flags = 1 << AS_NO_WRITEBACK_TAGS,
}
};
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
error = radix_tree_insert(&address_space->page_tree,
- entry.val, page);
+ swp_offset(entry), page);
if (likely(!error)) {
address_space->nrpages++;
__inc_node_page_state(page, NR_FILE_PAGES);
entry.val = page_private(page);
address_space = swap_address_space(entry);
- radix_tree_delete(&address_space->page_tree, page_private(page));
+ radix_tree_delete(&address_space->page_tree, swp_offset(entry));
set_page_private(page, 0);
ClearPageSwapCache(page);
address_space->nrpages--;
void free_page_and_swap_cache(struct page *page)
{
free_swap_cache(page);
- if (is_huge_zero_page(page))
- put_huge_zero_page();
- else
+ if (!is_huge_zero_page(page))
put_page(page);
}
{
struct page *page;
- page = find_get_page(swap_address_space(entry), entry.val);
+ page = find_get_page(swap_address_space(entry), swp_offset(entry));
if (page) {
INC_CACHE_INFO(find_success);
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, entry.val);
+ found_page = find_get_page(swapper_space, swp_offset(entry));
if (found_page)
break;
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), entry.val);
+ page = find_get_page(swap_address_space(entry), swp_offset(entry));
if (!page)
return 0;
/*
info->data = 0;
}
+static inline bool cluster_list_empty(struct swap_cluster_list *list)
+{
+ return cluster_is_null(&list->head);
+}
+
+static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
+{
+ return cluster_next(&list->head);
+}
+
+static void cluster_list_init(struct swap_cluster_list *list)
+{
+ cluster_set_null(&list->head);
+ cluster_set_null(&list->tail);
+}
+
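+/* Append cluster @idx to the tail of @list; @ci is the cluster_info array. */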
+static void cluster_list_add_tail(struct swap_cluster_list *list,
+ struct swap_cluster_info *ci,
+ unsigned int idx)
+{
+ if (cluster_list_empty(list)) {
+ cluster_set_next_flag(&list->head, idx, 0);
+ cluster_set_next_flag(&list->tail, idx, 0);
+ } else {
+ unsigned int tail = cluster_next(&list->tail);
+
+ cluster_set_next(&ci[tail], idx);
+ cluster_set_next_flag(&list->tail, idx, 0);
+ }
+}
+
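+/* Detach the first cluster from @list and return its index. */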
+static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
+ struct swap_cluster_info *ci)
+{
+ unsigned int idx;
+
+ idx = cluster_next(&list->head);
+ if (cluster_next(&list->tail) == idx) {
+ cluster_set_null(&list->head);
+ cluster_set_null(&list->tail);
+ } else
+ cluster_set_next_flag(&list->head,
+ cluster_next(&ci[idx]), 0);
+
+ return idx;
+}
+
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
unsigned int idx)
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
- if (cluster_is_null(&si->discard_cluster_head)) {
- cluster_set_next_flag(&si->discard_cluster_head,
- idx, 0);
- cluster_set_next_flag(&si->discard_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail = cluster_next(&si->discard_cluster_tail);
- cluster_set_next(&si->cluster_info[tail], idx);
- cluster_set_next_flag(&si->discard_cluster_tail,
- idx, 0);
- }
+ cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
schedule_work(&si->discard_work);
}
info = si->cluster_info;
- while (!cluster_is_null(&si->discard_cluster_head)) {
- idx = cluster_next(&si->discard_cluster_head);
-
- cluster_set_next_flag(&si->discard_cluster_head,
- cluster_next(&info[idx]), 0);
- if (cluster_next(&si->discard_cluster_tail) == idx) {
- cluster_set_null(&si->discard_cluster_head);
- cluster_set_null(&si->discard_cluster_tail);
- }
+ while (!cluster_list_empty(&si->discard_clusters)) {
+ idx = cluster_list_del_first(&si->discard_clusters, info);
spin_unlock(&si->lock);
discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
spin_lock(&si->lock);
cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&si->free_cluster_head)) {
- cluster_set_next_flag(&si->free_cluster_head,
- idx, 0);
- cluster_set_next_flag(&si->free_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail;
-
- tail = cluster_next(&si->free_cluster_tail);
- cluster_set_next(&info[tail], idx);
- cluster_set_next_flag(&si->free_cluster_tail,
- idx, 0);
- }
+ cluster_list_add_tail(&si->free_clusters, info, idx);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
}
if (!cluster_info)
return;
if (cluster_is_free(&cluster_info[idx])) {
- VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
- cluster_set_next_flag(&p->free_cluster_head,
- cluster_next(&cluster_info[idx]), 0);
- if (cluster_next(&p->free_cluster_tail) == idx) {
- cluster_set_null(&p->free_cluster_tail);
- cluster_set_null(&p->free_cluster_head);
- }
+ VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
+ cluster_list_del_first(&p->free_clusters, cluster_info);
cluster_set_count_flag(&cluster_info[idx], 0, 0);
}
}
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&p->free_cluster_head)) {
- cluster_set_next_flag(&p->free_cluster_head, idx, 0);
- cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
- } else {
- unsigned int tail = cluster_next(&p->free_cluster_tail);
- cluster_set_next(&cluster_info[tail], idx);
- cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
- }
+ cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
}
}
bool conflict;
offset /= SWAPFILE_CLUSTER;
- conflict = !cluster_is_null(&si->free_cluster_head) &&
- offset != cluster_next(&si->free_cluster_head) &&
+ conflict = !cluster_list_empty(&si->free_clusters) &&
+ offset != cluster_list_first(&si->free_clusters) &&
cluster_is_free(&si->cluster_info[offset]);
if (!conflict)
new_cluster:
cluster = this_cpu_ptr(si->percpu_cluster);
if (cluster_is_null(&cluster->index)) {
- if (!cluster_is_null(&si->free_cluster_head)) {
- cluster->index = si->free_cluster_head;
+ if (!cluster_list_empty(&si->free_clusters)) {
+ cluster->index = si->free_clusters.head;
cluster->next = cluster_next(&cluster->index) *
SWAPFILE_CLUSTER;
- } else if (!cluster_is_null(&si->discard_cluster_head)) {
+ } else if (!cluster_list_empty(&si->discard_clusters)) {
/*
* we don't have free cluster but have some clusters in
* discarding, do discard now and reclaim them
if (p) {
if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
page = find_get_page(swap_address_space(entry),
- entry.val);
+ swp_offset(entry));
if (page && !trylock_page(page)) {
put_page(page);
page = NULL;
nr_good_pages = maxpages - 1; /* omit header page */
- cluster_set_null(&p->free_cluster_head);
- cluster_set_null(&p->free_cluster_tail);
- cluster_set_null(&p->discard_cluster_head);
- cluster_set_null(&p->discard_cluster_tail);
+ cluster_list_init(&p->free_clusters);
+ cluster_list_init(&p->discard_clusters);
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
for (i = 0; i < nr_clusters; i++) {
if (!cluster_count(&cluster_info[idx])) {
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&p->free_cluster_head)) {
- cluster_set_next_flag(&p->free_cluster_head,
- idx, 0);
- cluster_set_next_flag(&p->free_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail;
-
- tail = cluster_next(&p->free_cluster_tail);
- cluster_set_next(&cluster_info[tail], idx);
- cluster_set_next_flag(&p->free_cluster_tail,
- idx, 0);
- }
+ cluster_list_add_tail(&p->free_clusters, cluster_info,
+ idx);
}
idx++;
if (idx == nr_clusters)
{
int i;
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
if (!vmacache_valid(mm))
return NULL;
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];
{
int i;
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
if (!vmacache_valid(mm))
return NULL;
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache[i];
struct vm_struct *area;
BUG_ON(in_interrupt());
- if (flags & VM_IOREMAP)
- align = 1ul << clamp_t(int, fls_long(size),
- PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;
+ if (flags & VM_IOREMAP)
+ align = 1ul << clamp_t(int, get_count_order_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
if (inactive_list_is_low(lruvec, false, sc))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
-
- throttle_vm_writeout(sc->gfp_mask);
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
- pages_for_compaction = (2UL << sc->order);
+ pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
continue;
switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
- case COMPACT_PARTIAL:
+ case COMPACT_SUCCESS:
case COMPACT_CONTINUE:
return false;
default:
}
/*
- * Returns true if compaction should go ahead for a high-order request, or
- * the high-order allocation would succeed without compaction.
+ * Returns true if compaction should go ahead for a costly-order request, or
+ * the allocation would already succeed without compaction. Return false if we
+ * should reclaim first.
*/
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
- bool watermark_ok;
+ enum compact_result suitable;
- /*
- * Compaction takes time to run and there are potentially other
- * callers using the pages just freed. Continue reclaiming until
- * there is a buffer of free pages available to give compaction
- * a reasonable chance of completing and allocating the page
- */
- watermark = high_wmark_pages(zone) + (2UL << sc->order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
-
- /*
- * If compaction is deferred, reclaim up to a point where
- * compaction will have a chance of success when re-enabled
- */
- if (compaction_deferred(zone, sc->order))
- return watermark_ok;
+ suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
+ if (suitable == COMPACT_SUCCESS)
+ /* Allocation should succeed already. Don't reclaim. */
+ return true;
+ if (suitable == COMPACT_SKIPPED)
+ /* Compaction cannot yet proceed. Do reclaim. */
+ return false;
/*
- * If compaction is not ready to start and allocation is not likely
- * to succeed without it, then keep reclaiming.
+ * Compaction is already possible, but it takes time to run and there
+ * are potentially other callers using the pages just freed. So proceed
+ * with reclaim to make a buffer of free pages available to give
+ * compaction a reasonable chance of completing and allocating the page.
+ * Note that we won't actually reclaim the whole buffer in one attempt
+ * as the target watermark in should_continue_reclaim() is lower. But if
+ * we are already above the high+gap watermark, don't reclaim at all.
*/
- if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
- return false;
+ watermark = high_wmark_pages(zone) + compact_gap(sc->order);
- return watermark_ok;
+ return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
/*
*/
nid = mem_cgroup_select_victim_node(memcg);
- zonelist = NODE_DATA(nid)->node_zonelists;
+ zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
* excessive reclaim. Assume that a process requested a high-order
* can direct reclaim/compact.
*/
- if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;
return sc->nr_scanned >= sc->nr_to_reclaim;
return 0;
}
-#ifdef CONFIG_PAGE_OWNER
-static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
- pg_data_t *pgdat,
- struct zone *zone)
-{
- struct page *page;
- struct page_ext *page_ext;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
- unsigned long count[MIGRATE_TYPES] = { 0, };
- int pageblock_mt, page_mt;
- int i;
-
- /* Scan block by block. First and last block may be incomplete */
- pfn = zone->zone_start_pfn;
-
- /*
- * Walk the zone in pageblock_nr_pages steps. If a page block spans
- * a zone boundary, it will be double counted between zones. This does
- * not matter as the mixed block count will still be correct
- */
- for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
- continue;
- }
-
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
- block_end_pfn = min(block_end_pfn, end_pfn);
-
- page = pfn_to_page(pfn);
- pageblock_mt = get_pageblock_migratetype(page);
-
- for (; pfn < block_end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- page = pfn_to_page(pfn);
-
- if (page_zone(page) != zone)
- continue;
-
- if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
- continue;
- }
-
- if (PageReserved(page))
- continue;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- continue;
-
- if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
-
- page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
- if (pageblock_mt != page_mt) {
- if (is_migrate_cma(pageblock_mt))
- count[MIGRATE_MOVABLE]++;
- else
- count[pageblock_mt]++;
-
- pfn = block_end_pfn;
- break;
- }
- pfn += (1UL << page_ext->order) - 1;
- }
- }
-
- /* Print counts */
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (i = 0; i < MIGRATE_TYPES; i++)
- seq_printf(m, "%12lu ", count[i]);
- seq_putc(m, '\n');
-}
-#endif /* CONFIG_PAGE_OWNER */
-
/*
* Print out the number of pageblocks for each migratetype that contain pages
* of other types. This gives an indication of how well fallbacks are being
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
- seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+
+ seq_puts(m, vmstat_text[off]);
+ seq_put_decimal_ull(m, " ", *l);
+ seq_putc(m, '\n');
return 0;
}
round_jiffies_relative(sysctl_stat_interval));
}
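+/* At init, mark every node that has at least one online CPU with N_CPU. */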
+static void __init init_cpu_node_state(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ put_online_cpus();
+}
+
static void vmstat_cpu_dead(int node)
{
int cpu;
#ifdef CONFIG_SMP
cpu_notifier_register_begin();
__register_cpu_notifier(&vmstat_notifier);
+ init_cpu_node_state();
start_shepherd_timer();
cpu_notifier_register_done();
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define CREATE_TRACE_POINTS
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mount.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>
+#include <trace/events/zsmalloc.h>
#define ZSPAGE_MAGIC 0x58
/* Destination page for migration which should be a first page
* of zspage. */
struct page *d_page;
- /* Starting object index within @s_page which used for live object
- * in the subpage. */
+ /* Starting object index within @s_page which is used for live
+ * objects in the subpage. */
int obj_idx;
+
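+ /* Stats for this compaction run, reported via the zs_compact tracepoint. */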
+ unsigned long nr_migrated_obj;
+ unsigned long nr_freed_pages;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj = obj_malloc(class, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
+ cc->nr_migrated_obj++;
/*
* record_obj updates handle's value to free_obj and it will
* invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
- struct zs_compact_control cc;
+ struct zs_compact_control cc = {
+ .nr_migrated_obj = 0,
+ .nr_freed_pages = 0,
+ };
struct zspage *src_zspage;
struct zspage *dst_zspage = NULL;
putback_zspage(class, dst_zspage);
if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
free_zspage(pool, class, src_zspage);
- pool->stats.pages_compacted += class->pages_per_zspage;
+ cc.nr_freed_pages += class->pages_per_zspage;
}
spin_unlock(&class->lock);
cond_resched();
putback_zspage(class, src_zspage);
spin_unlock(&class->lock);
+
+ pool->stats.pages_compacted += cc.nr_freed_pages;
+ trace_zs_compact(class->index, cc.nr_migrated_obj, cc.nr_freed_pages);
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
struct size_class *class;
+ unsigned long pages_compacted_before = pool->stats.pages_compacted;
+
+ trace_zs_compact_start(pool->name);
for (i = zs_size_classes - 1; i >= 0; i--) {
class = pool->size_class[i];
__zs_compact(pool, class);
}
+ trace_zs_compact_end(pool->name,
+ pool->stats.pages_compacted - pages_compacted_before);
+
return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);
struct net *net = sock_net(sk);
kgid_t group = current_egid();
struct group_info *group_info;
- int i, j, count;
+ int i;
kgid_t low, high;
int ret = 0;
return 0;
group_info = get_current_groups();
- count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
- for (j = 0; j < cp_count; j++) {
- kgid_t gid = group_info->blocks[i][j];
- if (gid_lte(low, gid) && gid_lte(gid, high))
- goto out_release_group;
- }
+ for (i = 0; i < group_info->ngroups; i++) {
+ kgid_t gid = group_info->gid[i];
- count -= cp_count;
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group;
}
ret = -EACCES;
if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
goto out_nomatch;
for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
- if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
- GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(gcred->acred.group_info->gid[i],
+ acred->group_info->gid[i]))
goto out_nomatch;
}
out_match:
kgid = make_kgid(&init_user_ns, tmp);
if (!gid_valid(kgid))
goto out_free_groups;
- GROUP_AT(creds->cr_group_info, i) = kgid;
+ creds->cr_group_info->gid[i] = kgid;
}
return 0;
kgid = make_kgid(&init_user_ns, id);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
+ rsci.cred.cr_group_info->gid[i] = kgid;
}
/* mech name */
cred->uc_gid = acred->gid;
for (i = 0; i < groups; i++)
- cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+ cred->uc_gids[i] = acred->group_info->gid[i];
if (i < NFS_NGROUPS)
cred->uc_gids[i] = INVALID_GID;
if (groups > NFS_NGROUPS)
groups = NFS_NGROUPS;
for (i = 0; i < groups ; i++)
- if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
+ if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
return 0;
if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
kgid = make_kgid(&init_user_ns, gid);
if (!gid_valid(kgid))
goto out;
- GROUP_AT(ug.gi, i) = kgid;
+ ug.gi->gid[i] = kgid;
}
ugp = unix_gid_lookup(cd, uid);
seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
for (i = 0; i < glen; i++)
- seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
+ seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i]));
seq_printf(m, "\n");
return 0;
}
return SVC_CLOSE;
for (i = 0; i < slen; i++) {
kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
- GROUP_AT(cred->cr_group_info, i) = kgid;
+ cred->cr_group_info->gid[i] = kgid;
}
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
*authp = rpc_autherr_badverf;
my $spelling_file = "$D/spelling.txt";
my $codespell = 0;
my $codespellfile = "/usr/share/codespell/dictionary.txt";
+my $conststructsfile = "$D/const_structs.checkpatch";
my $color = 1;
my $allow_c99_comments = 1;
0[0-7][0-7][2367]
}x;
+our %mode_permission_string_types = (
+ "S_IRWXU" => 0700,
+ "S_IRUSR" => 0400,
+ "S_IWUSR" => 0200,
+ "S_IXUSR" => 0100,
+ "S_IRWXG" => 0070,
+ "S_IRGRP" => 0040,
+ "S_IWGRP" => 0020,
+ "S_IXGRP" => 0010,
+ "S_IRWXO" => 0007,
+ "S_IROTH" => 0004,
+ "S_IWOTH" => 0002,
+ "S_IXOTH" => 0001,
+ "S_IRWXUGO" => 0777,
+ "S_IRUGO" => 0444,
+ "S_IWUGO" => 0222,
+ "S_IXUGO" => 0111,
+);
+
+#Create a search pattern for all these strings to speed up a loop below
+our $mode_perms_string_search = "";
+foreach my $entry (keys %mode_permission_string_types) {
+ $mode_perms_string_search .= '|' if ($mode_perms_string_search ne "");
+ $mode_perms_string_search .= $entry;
+}
+
our $allowed_asm_includes = qr{(?x:
irq|
memory|
$misspellings = join("|", sort keys %spelling_fix) if keys %spelling_fix;
+my $const_structs = "";
+if (open(my $conststructs, '<', $conststructsfile)) {
+ while (<$conststructs>) {
+ my $line = $_;
+
+ $line =~ s/\s*\n?$//g;
+ $line =~ s/^\s*//g;
+
+ next if ($line =~ m/^\s*#/);
+ next if ($line =~ m/^\s*$/);
+ if ($line =~ /\s/) {
+ print("$conststructsfile: '$line' invalid - ignored\n");
+ next;
+ }
+
+ $const_structs .= '|' if ($const_structs ne "");
+ $const_structs .= $line;
+ }
+ close($conststructs);
+} else {
+ warn "No structs that should be const will be found - file '$conststructsfile': $!\n";
+}
+
sub build_types {
my $mods = "(?x: \n" . join("|\n ", (@modifierList, @modifierListFile)) . "\n)";
my $all = "(?x: \n" . join("|\n ", (@typeList, @typeListFile)) . "\n)";
}
}
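+# Returns true if get_maintainer.pl reports the file's MAINTAINERS status as obsolete.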
+sub is_maintained_obsolete {
+ my ($filename) = @_;
+
+ return 0 if (!(-e "$root/scripts/get_maintainer.pl"));
+
+ my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`;
+
+ return $status =~ /obsolete/i;
+}
+
my $camelcase_seeded = 0;
sub seed_camelcase_includes {
return if ($camelcase_seeded);
}
if ($found_file) {
+ if (is_maintained_obsolete($realfile)) {
+ WARN("OBSOLETE",
+ "$realfile is marked as 'obsolete' in the MAINTAINERS hierarchy. No unnecessary modifications please.\n");
+ }
if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
$check = 1;
} else {
"Block comments use a trailing */ on a separate line\n" . $herecurr);
}
+# Block comment * alignment
+ if ($prevline =~ /$;[ \t]*$/ && #ends in comment
+ $line =~ /^\+[ \t]*$;/ && #leading comment
+ $rawline =~ /^\+[ \t]*\*/ && #leading *
+ (($prevrawline =~ /^\+.*?\/\*/ && #leading /*
+ $prevrawline !~ /\*\/[ \t]*$/) || #no trailing */
+ $prevrawline =~ /^\+[ \t]*\*/)) { #leading *
+ my $oldindent;
+ $prevrawline =~ m@^\+([ \t]*/?)\*@;
+ if (defined($1)) {
+ $oldindent = expand_tabs($1);
+ } else {
+ $prevrawline =~ m@^\+(.*/?)\*@;
+ $oldindent = expand_tabs($1);
+ }
+ $rawline =~ m@^\+([ \t]*)\*@;
+ my $newindent = $1;
+ $newindent = expand_tabs($newindent);
+ if (length($oldindent) ne length($newindent)) {
+ WARN("BLOCK_COMMENT_STYLE",
+ "Block comments should align the * on each line\n" . $hereprev);
+ }
+ }
+
# check for missing blank lines after struct/union declarations
# with exceptions for various attributes and macros
if ($prevline =~ /^[\+ ]};?\s*$/ &&
}
# Check for memcpy(foo, bar, ETH_ALEN) that could be ether_addr_copy(foo, bar)
- if ($^V && $^V ge 5.10.0 &&
- defined $stat &&
- $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
- if (WARN("PREFER_ETHER_ADDR_COPY",
- "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . "$here\n$stat\n") &&
- $fix) {
- $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/;
- }
- }
+# if ($^V && $^V ge 5.10.0 &&
+# defined $stat &&
+# $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
+# if (WARN("PREFER_ETHER_ADDR_COPY",
+# "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . "$here\n$stat\n") &&
+# $fix) {
+# $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/;
+# }
+# }
# Check for memcmp(foo, bar, ETH_ALEN) that could be ether_addr_equal*(foo, bar)
- if ($^V && $^V ge 5.10.0 &&
- defined $stat &&
- $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
- WARN("PREFER_ETHER_ADDR_EQUAL",
- "Prefer ether_addr_equal() or ether_addr_equal_unaligned() over memcmp()\n" . "$here\n$stat\n")
- }
+# if ($^V && $^V ge 5.10.0 &&
+# defined $stat &&
+# $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
+# WARN("PREFER_ETHER_ADDR_EQUAL",
+# "Prefer ether_addr_equal() or ether_addr_equal_unaligned() over memcmp()\n" . "$here\n$stat\n")
+# }
# check for memset(foo, 0x0, ETH_ALEN) that could be eth_zero_addr
# check for memset(foo, 0xFF, ETH_ALEN) that could be eth_broadcast_addr
- if ($^V && $^V ge 5.10.0 &&
- defined $stat &&
- $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
-
- my $ms_val = $7;
-
- if ($ms_val =~ /^(?:0x|)0+$/i) {
- if (WARN("PREFER_ETH_ZERO_ADDR",
- "Prefer eth_zero_addr over memset()\n" . "$here\n$stat\n") &&
- $fix) {
- $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_zero_addr($2)/;
- }
- } elsif ($ms_val =~ /^(?:0xff|255)$/i) {
- if (WARN("PREFER_ETH_BROADCAST_ADDR",
- "Prefer eth_broadcast_addr() over memset()\n" . "$here\n$stat\n") &&
- $fix) {
- $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_broadcast_addr($2)/;
- }
- }
- }
+# if ($^V && $^V ge 5.10.0 &&
+# defined $stat &&
+# $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
+#
+# my $ms_val = $7;
+#
+# if ($ms_val =~ /^(?:0x|)0+$/i) {
+# if (WARN("PREFER_ETH_ZERO_ADDR",
+# "Prefer eth_zero_addr over memset()\n" . "$here\n$stat\n") &&
+# $fix) {
+# $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_zero_addr($2)/;
+# }
+# } elsif ($ms_val =~ /^(?:0xff|255)$/i) {
+# if (WARN("PREFER_ETH_BROADCAST_ADDR",
+# "Prefer eth_broadcast_addr() over memset()\n" . "$here\n$stat\n") &&
+# $fix) {
+# $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_broadcast_addr($2)/;
+# }
+# }
+# }
# typecasts on min/max could be min_t/max_t
if ($^V && $^V ge 5.10.0 &&
}
# check for various structs that are normally const (ops, kgdb, device_tree)
- my $const_structs = qr{
- acpi_dock_ops|
- address_space_operations|
- backlight_ops|
- block_device_operations|
- dentry_operations|
- dev_pm_ops|
- dma_map_ops|
- extent_io_ops|
- file_lock_operations|
- file_operations|
- hv_ops|
- ide_dma_ops|
- intel_dvo_dev_ops|
- item_operations|
- iwl_ops|
- kgdb_arch|
- kgdb_io|
- kset_uevent_ops|
- lock_manager_operations|
- microcode_ops|
- mtrr_ops|
- neigh_ops|
- nlmsvc_binding|
- of_device_id|
- pci_raw_ops|
- pipe_buf_operations|
- platform_hibernation_ops|
- platform_suspend_ops|
- proto_ops|
- rpc_pipe_ops|
- seq_operations|
- snd_ac97_build_ops|
- soc_pcmcia_socket_ops|
- stacktrace_ops|
- sysfs_ops|
- tty_operations|
- uart_ops|
- usb_mon_operations|
- wd_ops}x;
if ($line !~ /\bconst\b/ &&
$line =~ /\bstruct\s+($const_structs)\b/) {
WARN("CONST_STRUCT",
$arg_pos--;
$skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}";
}
- my $test = "\\b$func\\s*\\(${skip_args}([\\d]+)\\s*[,\\)]";
+ my $test = "\\b$func\\s*\\(${skip_args}($FuncArg(?:\\|\\s*$FuncArg)*)\\s*[,\\)]";
if ($line =~ /$test/) {
my $val = $1;
$val = $6 if ($skip_args ne "");
-
- if ($val !~ /^0$/ &&
- (($val =~ /^$Int$/ && $val !~ /^$Octal$/) ||
- length($val) ne 4)) {
+ if (($val =~ /^$Int$/ && $val !~ /^$Octal$/) ||
+ ($val =~ /^$Octal$/ && length($val) ne 4)) {
ERROR("NON_OCTAL_PERMISSIONS",
"Use 4 digit octal (0777) not decimal permissions\n" . $herecurr);
- } elsif ($val =~ /^$Octal$/ && (oct($val) & 02)) {
+ }
+ if ($val =~ /^$Octal$/ && (oct($val) & 02)) {
ERROR("EXPORTED_WORLD_WRITABLE",
"Exporting writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr);
}
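+# If symbolic S_I* permission macros are used, OR their octal values together
+# and suggest the equivalent 4-digit octal form instead.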
+ if ($val =~ /\b$mode_perms_string_search\b/) {
+ my $to = 0;
+ while ($val =~ /\b($mode_perms_string_search)\b(?:\s*\|\s*)?\s*/g) {
+ $to |= $mode_permission_string_types{$1};
+ }
+ my $new = sprintf("%04o", $to);
+ if (WARN("SYMBOLIC_PERMS",
+ "Symbolic permissions are not preferred. Consider using octal permissions $new.\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\Q$val\E/$new/;
+ }
+ }
}
}
}
--- /dev/null
+acpi_dock_ops
+address_space_operations
+backlight_ops
+block_device_operations
+clk_ops
+comedi_lrange
+component_ops
+dentry_operations
+dev_pm_ops
+dma_map_ops
+driver_info
+drm_connector_funcs
+drm_encoder_funcs
+drm_encoder_helper_funcs
+ethtool_ops
+extent_io_ops
+file_lock_operations
+file_operations
+hv_ops
+ide_dma_ops
+ide_port_ops
+inode_operations
+intel_dvo_dev_ops
+irq_domain_ops
+item_operations
+iwl_cfg
+iwl_ops
+kgdb_arch
+kgdb_io
+kset_uevent_ops
+lock_manager_operations
+machine_desc
+microcode_ops
+mlxsw_reg_info
+mtrr_ops
+neigh_ops
+net_device_ops
+nlmsvc_binding
+nvkm_device_chip
+of_device_id
+pci_raw_ops
+pipe_buf_operations
+platform_hibernation_ops
+platform_suspend_ops
+proto_ops
+regmap_access_table
+rpc_pipe_ops
+rtc_class_ops
+sd_desc
+seq_operations
+sirfsoc_padmux
+snd_ac97_build_ops
+snd_soc_component_driver
+soc_pcmcia_socket_ops
+stacktrace_ops
+sysfs_ops
+tty_operations
+uart_ops
+usb_mon_operations
+v4l2_ctrl_ops
+v4l2_ioctl_ops
+vm_operations_struct
+wacom_features
+wd_ops
devicetable-offsets-file := devicetable-offsets.h
-define sed-y
- "/^->/{s:->#\(.*\):/* \1 */:; \
- s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
- s:->::; p;}"
-endef
-
quiet_cmd_offsets = GEN $@
define cmd_offsets
(set -e; \
echo " * This file was generated by Kbuild"; \
echo " *"; \
echo " */"; \
- echo ""; \
- sed -ne $(sed-y) $<; \
- echo ""; \
+ sed -ne '/#define/{s/\$$//;s/#//2;s/$$/*\//;p;}' $<; \
echo "#endif" ) > $@
endef
#define DATA_SECTIONS ".data", ".data.rel"
#define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \
- ".kprobes.text"
+ ".kprobes.text", ".cpuidle.text"
#define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
".fixup", ".entry.text", ".exception.text", ".text.*", \
".coldtext"
strcmp(".spinlock.text", txtname) == 0 ||
strcmp(".irqentry.text", txtname) == 0 ||
strcmp(".kprobes.text", txtname) == 0 ||
+ strcmp(".cpuidle.text", txtname) == 0 ||
strcmp(".text.unlikely", txtname) == 0;
}
".spinlock.text" => 1,
".irqentry.text" => 1,
".kprobes.text" => 1,
+ ".cpuidle.text" => 1,
".text.unlikely" => 1,
);
miximum||maximum
mmnemonic||mnemonic
mnay||many
-modeled||modelled
modulues||modules
monochorome||monochrome
monochromo||monochrome
LDFLAGS += -lpthread -lurcu
TARGETS = main
OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
- regression1.o regression2.o regression3.o multiorder.o
+ regression1.o regression2.o regression3.o multiorder.o \
+ iteration_check.o
targets: $(TARGETS)
--- /dev/null
+/*
+ * iteration_check.c: test races having to do with radix tree iteration
+ * Copyright (c) 2016 Intel Corporation
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/radix-tree.h>
+#include <pthread.h>
+#include "test.h"
+
+#define NUM_THREADS 4
+#define TAG 0
+static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_t threads[NUM_THREADS];
+RADIX_TREE(tree, GFP_KERNEL);
+bool test_complete;
+
+/* relentlessly fill the tree with tagged entries */
+static void *add_entries_fn(void *arg)
+{
+ int pgoff;
+
+ while (!test_complete) {
+ for (pgoff = 0; pgoff < 100; pgoff++) {
+ pthread_mutex_lock(&tree_lock);
+ if (item_insert(&tree, pgoff) == 0)
+ item_tag_set(&tree, pgoff, TAG);
+ pthread_mutex_unlock(&tree_lock);
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find
+ * things that have been removed and randomly resetting our iteration to the
+ * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and
+ * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a
+ * NULL 'slot' variable.
+ */
+static void *tagged_iteration_fn(void *arg)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ while (!test_complete) {
+ rcu_read_lock();
+ radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) {
+ void *entry;
+ int i;
+
+ /* busy wait to let removals happen */
+ for (i = 0; i < 1000000; i++)
+ ;
+
+ entry = radix_tree_deref_slot(slot);
+ if (unlikely(!entry))
+ continue;
+
+ if (radix_tree_deref_retry(entry)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (rand() % 50 == 0)
+ slot = radix_tree_iter_next(&iter);
+ }
+ rcu_read_unlock();
+ }
+
+ return NULL;
+}
+
+/*
+ * Iterate over the entries, doing a radix_tree_iter_retry() as we find things
+ * that have been removed and randomly resetting our iteration to the next
+ * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and
+ * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a
+ * NULL 'slot' variable.
+ */
+static void *untagged_iteration_fn(void *arg)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ while (!test_complete) {
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &tree, &iter, 0) {
+ void *entry;
+ int i;
+
+ /* busy wait to let removals happen */
+ for (i = 0; i < 1000000; i++)
+ ;
+
+ entry = radix_tree_deref_slot(slot);
+ if (unlikely(!entry))
+ continue;
+
+ if (radix_tree_deref_retry(entry)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (rand() % 50 == 0)
+ slot = radix_tree_iter_next(&iter);
+ }
+ rcu_read_unlock();
+ }
+
+ return NULL;
+}
+
+/*
+ * Randomly remove entries to help induce radix_tree_iter_retry() calls in the
+ * two iteration functions.
+ */
+static void *remove_entries_fn(void *arg)
+{
+ while (!test_complete) {
+ int pgoff;
+
+ pgoff = rand() % 100;
+
+ pthread_mutex_lock(&tree_lock);
+ item_delete(&tree, pgoff);
+ pthread_mutex_unlock(&tree_lock);
+ }
+
+ return NULL;
+}
+
+/* This is a unit test for a bug found by the syzkaller tester */
+void iteration_test(void)
+{
+ int i;
+
+ printf("Running iteration tests for 10 seconds\n");
+
+ srand(time(0));
+ test_complete = false;
+
+ if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) {
+ perror("pthread_create");
+ exit(1);
+ }
+ if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) {
+ perror("pthread_create");
+ exit(1);
+ }
+ if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) {
+ perror("pthread_create");
+ exit(1);
+ }
+ if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) {
+ perror("pthread_create");
+ exit(1);
+ }
+
+ sleep(10);
+ test_complete = true;
+
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (pthread_join(threads[i], NULL)) {
+ perror("pthread_join");
+ exit(1);
+ }
+ }
+
+ item_kill_tree(&tree);
+}
regression1_test();
regression2_test();
regression3_test();
+ iteration_test();
single_thread_tests(long_run);
sleep(1);
#include "regression.h"
static RADIX_TREE(mt_tree, GFP_KERNEL);
-static pthread_mutex_t mt_lock;
+static pthread_mutex_t mt_lock = PTHREAD_MUTEX_INITIALIZER;
struct page {
pthread_mutex_t lock;
void tag_check(void);
void multiorder_checks(void);
+void iteration_test(void);
struct item *
item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
on-fault-limit
transhuge-stress
userfaultfd
+mlock-random-test
BINARIES += thuge-gen
BINARIES += transhuge-stress
BINARIES += userfaultfd
+BINARIES += mlock-random-test
all: $(BINARIES)
%: %.c
userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h
$(CC) $(CFLAGS) -O2 -o $@ $< -lpthread
+mlock-random-test: mlock-random-test.c
+ $(CC) $(CFLAGS) -o $@ $< -lcap
+
../../../../usr/include/linux/kernel.h:
make -C ../../../.. headers_install
--- /dev/null
+/*
+ * It tests mlock()/mlock2() when they are invoked
+ * on randomly chosen memory regions.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
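+/*
+ * Set RLIMIT_MEMLOCK to @max bytes and drop all capabilities (including
+ * CAP_IPC_LOCK) so that the limit is actually enforced for this process.
+ */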
+int set_cap_limits(rlim_t max)
+{
+ struct rlimit new;
+ cap_t cap = cap_init();
+
+ new.rlim_cur = max;
+ new.rlim_max = max;
+ if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+ perror("setrlimit() returns error\n");
+ return -1;
+ }
+
+ /* drop capabilities including CAP_IPC_LOCK */
+ if (cap_set_proc(cap)) {
+ perror("cap_set_proc() returns error\n");
+ return -2;
+ }
+
+ return 0;
+}
+
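+/* Return the VmLck value from /proc/self/status in bytes, or -1 on error. */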
+int get_proc_locked_vm_size(void)
+{
+ FILE *f;
+ int ret = -1;
+ char line[1024] = {0};
+ unsigned long lock_size = 0;
+
+ f = fopen("/proc/self/status", "r");
+ if (!f) {
+ perror("fopen");
+ return -1;
+ }
+
+ while (fgets(line, 1024, f)) {
+ if (strstr(line, "VmLck")) {
+ ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+ if (ret <= 0) {
+ printf("sscanf() on VmLck error: %s: %d\n",
+ line, ret);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+ return (int)(lock_size << 10);
+ }
+ }
+
+ perror("cann't parse VmLck in /proc/self/status\n");
+ fclose(f);
+ return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+ FILE *smaps;
+ char *line = NULL;
+ unsigned long mmupage_size = 0;
+ size_t size = 0;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ return 0;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, "MMUPageSize")) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ /* found the MMUPageSize of this section */
+ if (sscanf(line, "MMUPageSize: %8lu kB",
+ &mmupage_size) < 1) {
+ printf("Unable to parse smaps entry for Size:%s\n",
+ line);
+ break;
+ }
+
+ }
+ free(line);
+ if (smaps)
+ fclose(smaps);
+ return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects mlock()/mlock2() to succeed (the range is within the rlimit).
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test randomly chooses start/len and performs mlock/mlock2 on the
+ * [start, start + len) memory range, which always lies within the
+ * allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0;
+ struct rlimit cur;
+ int page_size = 0;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur < alloc_size) {
+ printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+ return -1;
+ }
+
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ /*
+ * - choose mlock/mlock2 randomly
+ * - choose lock_size randomly but lock_size < alloc_size
+ * - choose start_offset randomly but p+start_offset+lock_size
+ * < p+alloc_size
+ */
+ int is_mlock = !!(rand() % 2);
+ int lock_size = rand() % alloc_size;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+
+ if (ret) {
+ printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size,
+ p + start_offset, lock_size);
+ return ret;
+ }
+ }
+
+ /*
+ * Check VmLck left by the tests.
+ */
+ locked_vm_size = get_proc_locked_vm_size();
+ page_size = get_proc_page_size((unsigned long)p);
+ if (page_size == 0) {
+ printf("cannot get proc MMUPageSize\n");
+ return -1;
+ }
+
+ if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+ printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+ locked_vm_size, alloc_size);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+/*
+ * We expect mlock()/mlock2() to fail (out of the rlimit).
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on the [start, start + len) range.
+ *
+ * The memory region size alloc_size is above the rlimit, and the
+ * len to be locked is also higher than the rlimit, so we always
+ * expect mlock/mlock2 to fail. The number of locked pages should
+ * not increase as a side effect.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0, old_locked_vm_size = 0;
+ struct rlimit cur;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur >= alloc_size) {
+ printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+ return -1;
+ }
+
+ old_locked_vm_size = get_proc_locked_vm_size();
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ int is_mlock = !!(rand() % 2);
+ int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+ + cur.rlim_cur;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+ if (ret == 0) {
+ printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size,
+ p + start_offset, lock_size);
+ return -1;
+ }
+ }
+
+ locked_vm_size = get_proc_locked_vm_size();
+ if (locked_vm_size != old_locked_vm_size) {
+ printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+ old_locked_vm_size,
+ locked_vm_size);
+ return -1;
+ }
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ char *p = NULL;
+ int ret = 0;
+
+ if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+ return -1;
+
+ p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+ free(p);
+
+
+ p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+ free(p);
+
+ return 0;
+}
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
-#include <syscall.h>
-#include <errno.h>
#include <stdbool.h>
-
-#ifndef MLOCK_ONFAULT
-#define MLOCK_ONFAULT 1
-#endif
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int mlock2_(void *start, size_t len, int flags)
-{
-#ifdef __NR_mlock2
- return syscall(__NR_mlock2, start, len, flags);
-#else
- errno = ENOSYS;
- return -1;
-#endif
-}
+#include "mlock2.h"
struct vm_boundaries {
unsigned long start;
return flags;
}
-static FILE *seek_to_smaps_entry(unsigned long addr)
-{
- FILE *file;
- char *line = NULL;
- size_t size = 0;
- unsigned long start, end;
- char perms[5];
- unsigned long offset;
- char dev[32];
- unsigned long inode;
- char path[BUFSIZ];
-
- file = fopen("/proc/self/smaps", "r");
- if (!file) {
- perror("fopen smaps");
- _exit(1);
- }
-
- while (getline(&line, &size, file) > 0) {
- if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
- &start, &end, perms, &offset, dev, &inode, path) < 6)
- goto next;
-
- if (start <= addr && addr < end)
- goto out;
-
-next:
- free(line);
- line = NULL;
- size = 0;
- }
-
- fclose(file);
- file = NULL;
-
-out:
- free(line);
- return file;
-}
-
#define VMFLAGS "VmFlags:"
static bool is_vmflag_set(unsigned long addr, const char *vmflag)
--- /dev/null
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
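+/*
+ * Return /proc/self/smaps opened and positioned just past the header line of
+ * the mapping that contains @addr, or NULL if no mapping matches.
+ */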
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end)
+ goto out;
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+ fclose(file);
+ file = NULL;
+
+out:
+ free(line);
+ return file;
+}
+