thp: don't allow transparent hugepage support without PSE
[deliverable/linux.git] / mm / huge_memory.c
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <asm/tlb.h>
19#include <asm/pgalloc.h>
20#include "internal.h"
21
22/*
23 * By default transparent hugepage support is enabled for all mappings
24 * and khugepaged scans all mappings. Defrag is only invoked by
25 * khugepaged hugepage allocations and by page faults inside
26 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
27 * allocations.
28 */
29unsigned long transparent_hugepage_flags __read_mostly =
30#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
31 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
32#endif
33#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
34 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
35#endif
36 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
37 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
38
39/* default scan 8*512 ptes (or vmas) every 10 seconds */
40static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
41static unsigned int khugepaged_pages_collapsed;
42static unsigned int khugepaged_full_scans;
43static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
44/* during fragmentation poll the hugepage allocator once every minute */
45static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
46static struct task_struct *khugepaged_thread __read_mostly;
47static DEFINE_MUTEX(khugepaged_mutex);
48static DEFINE_SPINLOCK(khugepaged_mm_lock);
49static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
50/*
51 * default collapse hugepages if there is at least one pte mapped like
52 * it would have happened if the vma was large enough during page
53 * fault.
54 */
55static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
56
57static int khugepaged(void *none);
58static int mm_slots_hash_init(void);
59static int khugepaged_slab_init(void);
60static void khugepaged_slab_free(void);
61
62#define MM_SLOTS_HASH_HEADS 1024
63static struct hlist_head *mm_slots_hash __read_mostly;
64static struct kmem_cache *mm_slot_cache __read_mostly;
65
66/**
67 * struct mm_slot - hash lookup from mm to mm_slot
68 * @hash: hash collision list
69 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
70 * @mm: the mm that this information is valid for
71 */
72struct mm_slot {
73 struct hlist_node hash;
74 struct list_head mm_node;
75 struct mm_struct *mm;
76};
77
78/**
79 * struct khugepaged_scan - cursor for scanning
80 * @mm_head: the head of the mm list to scan
81 * @mm_slot: the current mm_slot we are scanning
82 * @address: the next address inside that to be scanned
83 *
84 * There is only the one khugepaged_scan instance of this cursor structure.
85 */
86struct khugepaged_scan {
87 struct list_head mm_head;
88 struct mm_slot *mm_slot;
89 unsigned long address;
90} khugepaged_scan = {
91 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
92};
93
94
95static int set_recommended_min_free_kbytes(void)
96{
97 struct zone *zone;
98 int nr_zones = 0;
99 unsigned long recommended_min;
100 extern int min_free_kbytes;
101
102 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
103 &transparent_hugepage_flags) &&
104 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
105 &transparent_hugepage_flags))
106 return 0;
107
108 for_each_populated_zone(zone)
109 nr_zones++;
110
111 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
112 recommended_min = pageblock_nr_pages * nr_zones * 2;
113
114 /*
115 * Make sure that on average at least two pageblocks are almost free
116 * of another type, one for a migratetype to fall back to and a
 117 * second to avoid subsequent fallbacks of other types. There are 3
118 * MIGRATE_TYPES we care about.
119 */
120 recommended_min += pageblock_nr_pages * nr_zones *
121 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
122
123 /* don't ever allow to reserve more than 5% of the lowmem */
124 recommended_min = min(recommended_min,
125 (unsigned long) nr_free_buffer_pages() / 20);
126 recommended_min <<= (PAGE_SHIFT-10);
127
128 if (recommended_min > min_free_kbytes)
129 min_free_kbytes = recommended_min;
130 setup_per_zone_wmarks();
131 return 0;
132}
133late_initcall(set_recommended_min_free_kbytes);
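/*
 * Editor's note, not part of the original source: a worked example of the
 * calculation above, assuming 4 populated zones, 4KB base pages and
 * pageblock_nr_pages = 512 (2MB pageblocks), with MIGRATE_PCPTYPES = 3:
 *
 *   recommended_min  = 512 * 4 * 2     =  4096 pages
 *   recommended_min += 512 * 4 * 3 * 3 = 18432 pages  (22528 total)
 *
 * capped at nr_free_buffer_pages() / 20, then converted to kilobytes:
 * 22528 << (12 - 10) = 90112 KB, i.e. min_free_kbytes is raised to roughly
 * 88 MB unless it is already higher or the 5%-of-lowmem cap applies first.
 */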
134
135static int start_khugepaged(void)
136{
137 int err = 0;
138 if (khugepaged_enabled()) {
139 int wakeup;
140 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
141 err = -ENOMEM;
142 goto out;
143 }
144 mutex_lock(&khugepaged_mutex);
145 if (!khugepaged_thread)
146 khugepaged_thread = kthread_run(khugepaged, NULL,
147 "khugepaged");
148 if (unlikely(IS_ERR(khugepaged_thread))) {
149 printk(KERN_ERR
150 "khugepaged: kthread_run(khugepaged) failed\n");
151 err = PTR_ERR(khugepaged_thread);
152 khugepaged_thread = NULL;
153 }
154 wakeup = !list_empty(&khugepaged_scan.mm_head);
155 mutex_unlock(&khugepaged_mutex);
156 if (wakeup)
157 wake_up_interruptible(&khugepaged_wait);
158
159 set_recommended_min_free_kbytes();
160 } else
161 /* wakeup to exit */
162 wake_up_interruptible(&khugepaged_wait);
163out:
164 return err;
165}
166
167#ifdef CONFIG_SYSFS
168
169static ssize_t double_flag_show(struct kobject *kobj,
170 struct kobj_attribute *attr, char *buf,
171 enum transparent_hugepage_flag enabled,
172 enum transparent_hugepage_flag req_madv)
173{
174 if (test_bit(enabled, &transparent_hugepage_flags)) {
175 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
176 return sprintf(buf, "[always] madvise never\n");
177 } else if (test_bit(req_madv, &transparent_hugepage_flags))
178 return sprintf(buf, "always [madvise] never\n");
179 else
180 return sprintf(buf, "always madvise [never]\n");
181}
182static ssize_t double_flag_store(struct kobject *kobj,
183 struct kobj_attribute *attr,
184 const char *buf, size_t count,
185 enum transparent_hugepage_flag enabled,
186 enum transparent_hugepage_flag req_madv)
187{
188 if (!memcmp("always", buf,
189 min(sizeof("always")-1, count))) {
190 set_bit(enabled, &transparent_hugepage_flags);
191 clear_bit(req_madv, &transparent_hugepage_flags);
192 } else if (!memcmp("madvise", buf,
193 min(sizeof("madvise")-1, count))) {
194 clear_bit(enabled, &transparent_hugepage_flags);
195 set_bit(req_madv, &transparent_hugepage_flags);
196 } else if (!memcmp("never", buf,
197 min(sizeof("never")-1, count))) {
198 clear_bit(enabled, &transparent_hugepage_flags);
199 clear_bit(req_madv, &transparent_hugepage_flags);
200 } else
201 return -EINVAL;
202
203 return count;
204}
205
206static ssize_t enabled_show(struct kobject *kobj,
207 struct kobj_attribute *attr, char *buf)
208{
209 return double_flag_show(kobj, attr, buf,
210 TRANSPARENT_HUGEPAGE_FLAG,
211 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
212}
213static ssize_t enabled_store(struct kobject *kobj,
214 struct kobj_attribute *attr,
215 const char *buf, size_t count)
216{
217 ssize_t ret;
218
219 ret = double_flag_store(kobj, attr, buf, count,
220 TRANSPARENT_HUGEPAGE_FLAG,
221 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
222
223 if (ret > 0) {
224 int err = start_khugepaged();
225 if (err)
226 ret = err;
227 }
228
229 if (ret > 0 &&
230 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
231 &transparent_hugepage_flags) ||
232 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
233 &transparent_hugepage_flags)))
234 set_recommended_min_free_kbytes();
235
236 return ret;
237}
238static struct kobj_attribute enabled_attr =
239 __ATTR(enabled, 0644, enabled_show, enabled_store);
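/*
 * Editor's note (illustrative, not in the original file): once the sysfs
 * group is registered by hugepage_init() below, this attribute shows up as
 * /sys/kernel/mm/transparent_hugepage/enabled and accepts exactly the three
 * strings parsed by double_flag_store(), e.g.:
 *
 *   echo always  > /sys/kernel/mm/transparent_hugepage/enabled
 *   echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *   echo never   > /sys/kernel/mm/transparent_hugepage/enabled
 */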
240
241static ssize_t single_flag_show(struct kobject *kobj,
242 struct kobj_attribute *attr, char *buf,
243 enum transparent_hugepage_flag flag)
244{
245 if (test_bit(flag, &transparent_hugepage_flags))
246 return sprintf(buf, "[yes] no\n");
247 else
248 return sprintf(buf, "yes [no]\n");
249}
250static ssize_t single_flag_store(struct kobject *kobj,
251 struct kobj_attribute *attr,
252 const char *buf, size_t count,
253 enum transparent_hugepage_flag flag)
254{
255 if (!memcmp("yes", buf,
256 min(sizeof("yes")-1, count))) {
257 set_bit(flag, &transparent_hugepage_flags);
258 } else if (!memcmp("no", buf,
259 min(sizeof("no")-1, count))) {
260 clear_bit(flag, &transparent_hugepage_flags);
261 } else
262 return -EINVAL;
263
264 return count;
265}
266
267/*
268 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
269 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
270 * memory just to allocate one more hugepage.
271 */
272static ssize_t defrag_show(struct kobject *kobj,
273 struct kobj_attribute *attr, char *buf)
274{
275 return double_flag_show(kobj, attr, buf,
276 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
277 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
278}
279static ssize_t defrag_store(struct kobject *kobj,
280 struct kobj_attribute *attr,
281 const char *buf, size_t count)
282{
283 return double_flag_store(kobj, attr, buf, count,
284 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
285 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
286}
287static struct kobj_attribute defrag_attr =
288 __ATTR(defrag, 0644, defrag_show, defrag_store);
289
290#ifdef CONFIG_DEBUG_VM
291static ssize_t debug_cow_show(struct kobject *kobj,
292 struct kobj_attribute *attr, char *buf)
293{
294 return single_flag_show(kobj, attr, buf,
295 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
296}
297static ssize_t debug_cow_store(struct kobject *kobj,
298 struct kobj_attribute *attr,
299 const char *buf, size_t count)
300{
301 return single_flag_store(kobj, attr, buf, count,
302 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
303}
304static struct kobj_attribute debug_cow_attr =
305 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
306#endif /* CONFIG_DEBUG_VM */
307
308static struct attribute *hugepage_attr[] = {
309 &enabled_attr.attr,
310 &defrag_attr.attr,
311#ifdef CONFIG_DEBUG_VM
312 &debug_cow_attr.attr,
313#endif
314 NULL,
315};
316
317static struct attribute_group hugepage_attr_group = {
318 .attrs = hugepage_attr,
319};
320
321static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
322 struct kobj_attribute *attr,
323 char *buf)
324{
325 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
326}
327
328static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
329 struct kobj_attribute *attr,
330 const char *buf, size_t count)
331{
332 unsigned long msecs;
333 int err;
334
335 err = strict_strtoul(buf, 10, &msecs);
336 if (err || msecs > UINT_MAX)
337 return -EINVAL;
338
339 khugepaged_scan_sleep_millisecs = msecs;
340 wake_up_interruptible(&khugepaged_wait);
341
342 return count;
343}
344static struct kobj_attribute scan_sleep_millisecs_attr =
345 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
346 scan_sleep_millisecs_store);
347
348static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
349 struct kobj_attribute *attr,
350 char *buf)
351{
352 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
353}
354
355static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
356 struct kobj_attribute *attr,
357 const char *buf, size_t count)
358{
359 unsigned long msecs;
360 int err;
361
362 err = strict_strtoul(buf, 10, &msecs);
363 if (err || msecs > UINT_MAX)
364 return -EINVAL;
365
366 khugepaged_alloc_sleep_millisecs = msecs;
367 wake_up_interruptible(&khugepaged_wait);
368
369 return count;
370}
371static struct kobj_attribute alloc_sleep_millisecs_attr =
372 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
373 alloc_sleep_millisecs_store);
374
375static ssize_t pages_to_scan_show(struct kobject *kobj,
376 struct kobj_attribute *attr,
377 char *buf)
378{
379 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
380}
381static ssize_t pages_to_scan_store(struct kobject *kobj,
382 struct kobj_attribute *attr,
383 const char *buf, size_t count)
384{
385 int err;
386 unsigned long pages;
387
388 err = strict_strtoul(buf, 10, &pages);
389 if (err || !pages || pages > UINT_MAX)
390 return -EINVAL;
391
392 khugepaged_pages_to_scan = pages;
393
394 return count;
395}
396static struct kobj_attribute pages_to_scan_attr =
397 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
398 pages_to_scan_store);
399
400static ssize_t pages_collapsed_show(struct kobject *kobj,
401 struct kobj_attribute *attr,
402 char *buf)
403{
404 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
405}
406static struct kobj_attribute pages_collapsed_attr =
407 __ATTR_RO(pages_collapsed);
408
409static ssize_t full_scans_show(struct kobject *kobj,
410 struct kobj_attribute *attr,
411 char *buf)
412{
413 return sprintf(buf, "%u\n", khugepaged_full_scans);
414}
415static struct kobj_attribute full_scans_attr =
416 __ATTR_RO(full_scans);
417
418static ssize_t khugepaged_defrag_show(struct kobject *kobj,
419 struct kobj_attribute *attr, char *buf)
420{
421 return single_flag_show(kobj, attr, buf,
422 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
423}
424static ssize_t khugepaged_defrag_store(struct kobject *kobj,
425 struct kobj_attribute *attr,
426 const char *buf, size_t count)
427{
428 return single_flag_store(kobj, attr, buf, count,
429 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
430}
431static struct kobj_attribute khugepaged_defrag_attr =
432 __ATTR(defrag, 0644, khugepaged_defrag_show,
433 khugepaged_defrag_store);
434
435/*
436 * max_ptes_none controls if khugepaged should collapse hugepages over
437 * any unmapped ptes in turn potentially increasing the memory
438 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
439 * reduce the available free memory in the system as it
440 * runs. Increasing max_ptes_none will instead potentially reduce the
441 * free memory in the system during the khugepaged scan.
442 */
443static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
444 struct kobj_attribute *attr,
445 char *buf)
446{
447 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
448}
449static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
450 struct kobj_attribute *attr,
451 const char *buf, size_t count)
452{
453 int err;
454 unsigned long max_ptes_none;
455
456 err = strict_strtoul(buf, 10, &max_ptes_none);
457 if (err || max_ptes_none > HPAGE_PMD_NR-1)
458 return -EINVAL;
459
460 khugepaged_max_ptes_none = max_ptes_none;
461
462 return count;
463}
464static struct kobj_attribute khugepaged_max_ptes_none_attr =
465 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
466 khugepaged_max_ptes_none_store);
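/*
 * Editor's note (illustrative, not in the original file): with the default
 * max_ptes_none = HPAGE_PMD_NR-1 = 511 (x86_64 with 2MB hugepages),
 * khugepaged may collapse a 2MB range in which only a single 4KB pte is
 * mapped, allocating up to 511 * 4KB (~2MB) of previously unused memory for
 * that range. Setting max_ptes_none to 0 restricts collapses to fully mapped
 * ranges, so khugepaged never increases the memory footprint as it runs.
 */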
467
468static struct attribute *khugepaged_attr[] = {
469 &khugepaged_defrag_attr.attr,
470 &khugepaged_max_ptes_none_attr.attr,
471 &pages_to_scan_attr.attr,
472 &pages_collapsed_attr.attr,
473 &full_scans_attr.attr,
474 &scan_sleep_millisecs_attr.attr,
475 &alloc_sleep_millisecs_attr.attr,
476 NULL,
477};
478
479static struct attribute_group khugepaged_attr_group = {
480 .attrs = khugepaged_attr,
481 .name = "khugepaged",
482};
483#endif /* CONFIG_SYSFS */
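/*
 * Editor's note (illustrative, not in the original file): the two attribute
 * groups above are registered by hugepage_init() below, producing
 * /sys/kernel/mm/transparent_hugepage/{enabled,defrag[,debug_cow]} plus the
 * khugepaged/ subdirectory with defrag, max_ptes_none, pages_to_scan,
 * pages_collapsed, full_scans, scan_sleep_millisecs and
 * alloc_sleep_millisecs.
 */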
484
485static int __init hugepage_init(void)
486{
487 int err;
488#ifdef CONFIG_SYSFS
489 static struct kobject *hugepage_kobj;
490#endif
491
492 err = -EINVAL;
493 if (!has_transparent_hugepage()) {
494 transparent_hugepage_flags = 0;
495 goto out;
496 }
497
498#ifdef CONFIG_SYSFS
499 err = -ENOMEM;
500 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
501 if (unlikely(!hugepage_kobj)) {
502 printk(KERN_ERR "hugepage: failed kobject create\n");
503 goto out;
504 }
505
506 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
507 if (err) {
 508 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
509 goto out;
510 }
511
512 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
513 if (err) {
 514 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
515 goto out;
516 }
517#endif
518
519 err = khugepaged_slab_init();
520 if (err)
521 goto out;
522
523 err = mm_slots_hash_init();
524 if (err) {
525 khugepaged_slab_free();
526 goto out;
527 }
528
529 start_khugepaged();
530
531 set_recommended_min_free_kbytes();
532
533out:
534 return err;
535}
536module_init(hugepage_init)
537
538static int __init setup_transparent_hugepage(char *str)
539{
540 int ret = 0;
541 if (!str)
542 goto out;
543 if (!strcmp(str, "always")) {
544 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
545 &transparent_hugepage_flags);
546 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
547 &transparent_hugepage_flags);
548 ret = 1;
549 } else if (!strcmp(str, "madvise")) {
550 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
551 &transparent_hugepage_flags);
552 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
553 &transparent_hugepage_flags);
554 ret = 1;
555 } else if (!strcmp(str, "never")) {
556 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
557 &transparent_hugepage_flags);
558 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
559 &transparent_hugepage_flags);
560 ret = 1;
561 }
562out:
563 if (!ret)
564 printk(KERN_WARNING
565 "transparent_hugepage= cannot parse, ignored\n");
566 return ret;
567}
568__setup("transparent_hugepage=", setup_transparent_hugepage);
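/*
 * Editor's note (illustrative, not in the original file): the handler above
 * makes the same policy choice available on the kernel command line at boot,
 * mirroring the sysfs "enabled" file, e.g.:
 *
 *   transparent_hugepage=always
 *   transparent_hugepage=madvise
 *   transparent_hugepage=never
 */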
569
570static void prepare_pmd_huge_pte(pgtable_t pgtable,
571 struct mm_struct *mm)
572{
573 assert_spin_locked(&mm->page_table_lock);
574
575 /* FIFO */
576 if (!mm->pmd_huge_pte)
577 INIT_LIST_HEAD(&pgtable->lru);
578 else
579 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
580 mm->pmd_huge_pte = pgtable;
581}
582
583static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
584{
585 if (likely(vma->vm_flags & VM_WRITE))
586 pmd = pmd_mkwrite(pmd);
587 return pmd;
588}
589
590static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
591 struct vm_area_struct *vma,
592 unsigned long haddr, pmd_t *pmd,
593 struct page *page)
594{
595 int ret = 0;
596 pgtable_t pgtable;
597
598 VM_BUG_ON(!PageCompound(page));
599 pgtable = pte_alloc_one(mm, haddr);
600 if (unlikely(!pgtable)) {
601 mem_cgroup_uncharge_page(page);
602 put_page(page);
603 return VM_FAULT_OOM;
604 }
605
606 clear_huge_page(page, haddr, HPAGE_PMD_NR);
607 __SetPageUptodate(page);
608
609 spin_lock(&mm->page_table_lock);
610 if (unlikely(!pmd_none(*pmd))) {
611 spin_unlock(&mm->page_table_lock);
612 mem_cgroup_uncharge_page(page);
613 put_page(page);
614 pte_free(mm, pgtable);
615 } else {
616 pmd_t entry;
617 entry = mk_pmd(page, vma->vm_page_prot);
618 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
619 entry = pmd_mkhuge(entry);
620 /*
621 * The spinlocking to take the lru_lock inside
622 * page_add_new_anon_rmap() acts as a full memory
623 * barrier to be sure clear_huge_page writes become
624 * visible after the set_pmd_at() write.
625 */
626 page_add_new_anon_rmap(page, vma, haddr);
627 set_pmd_at(mm, haddr, pmd, entry);
628 prepare_pmd_huge_pte(pgtable, mm);
629 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
630 spin_unlock(&mm->page_table_lock);
631 }
632
633 return ret;
634}
635
636static inline gfp_t alloc_hugepage_gfpmask(int defrag)
637{
638 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
639}
640
641static inline struct page *alloc_hugepage_vma(int defrag,
642 struct vm_area_struct *vma,
643 unsigned long haddr)
644{
645 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
646 HPAGE_PMD_ORDER, vma, haddr);
647}
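/*
 * Editor's note (illustrative, not in the original file): with defrag
 * enabled the mask keeps __GFP_WAIT, so the huge allocation may reclaim and
 * compact memory; with defrag disabled __GFP_WAIT is cleared and a failed
 * hugepage allocation makes do_huge_pmd_anonymous_page() fall back to
 * regular base pages immediately instead of stalling the faulting task.
 */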
648
649#ifndef CONFIG_NUMA
650static inline struct page *alloc_hugepage(int defrag)
651{
652 return alloc_pages(alloc_hugepage_gfpmask(defrag),
653 HPAGE_PMD_ORDER);
654}
655#endif
656
657int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
658 unsigned long address, pmd_t *pmd,
659 unsigned int flags)
660{
661 struct page *page;
662 unsigned long haddr = address & HPAGE_PMD_MASK;
663 pte_t *pte;
664
665 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
666 if (unlikely(anon_vma_prepare(vma)))
667 return VM_FAULT_OOM;
668 if (unlikely(khugepaged_enter(vma)))
669 return VM_FAULT_OOM;
670 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
671 vma, haddr);
672 if (unlikely(!page))
673 goto out;
674 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
675 put_page(page);
676 goto out;
677 }
678
679 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
680 }
681out:
682 /*
683 * Use __pte_alloc instead of pte_alloc_map, because we can't
684 * run pte_offset_map on the pmd, if an huge pmd could
685 * materialize from under us from a different thread.
686 */
687 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
688 return VM_FAULT_OOM;
689 /* if an huge pmd materialized from under us just retry later */
690 if (unlikely(pmd_trans_huge(*pmd)))
691 return 0;
692 /*
693 * A regular pmd is established and it can't morph into a huge pmd
694 * from under us anymore at this point because we hold the mmap_sem
695 * read mode and khugepaged takes it in write mode. So now it's
696 * safe to run pte_offset_map().
697 */
698 pte = pte_offset_map(pmd, address);
699 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
700}
701
702int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
703 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
704 struct vm_area_struct *vma)
705{
706 struct page *src_page;
707 pmd_t pmd;
708 pgtable_t pgtable;
709 int ret;
710
711 ret = -ENOMEM;
712 pgtable = pte_alloc_one(dst_mm, addr);
713 if (unlikely(!pgtable))
714 goto out;
715
716 spin_lock(&dst_mm->page_table_lock);
717 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
718
719 ret = -EAGAIN;
720 pmd = *src_pmd;
721 if (unlikely(!pmd_trans_huge(pmd))) {
722 pte_free(dst_mm, pgtable);
723 goto out_unlock;
724 }
725 if (unlikely(pmd_trans_splitting(pmd))) {
726 /* split huge page running from under us */
727 spin_unlock(&src_mm->page_table_lock);
728 spin_unlock(&dst_mm->page_table_lock);
729 pte_free(dst_mm, pgtable);
730
731 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
732 goto out;
733 }
734 src_page = pmd_page(pmd);
735 VM_BUG_ON(!PageHead(src_page));
736 get_page(src_page);
737 page_dup_rmap(src_page);
738 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
739
740 pmdp_set_wrprotect(src_mm, addr, src_pmd);
741 pmd = pmd_mkold(pmd_wrprotect(pmd));
742 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
743 prepare_pmd_huge_pte(pgtable, dst_mm);
744
745 ret = 0;
746out_unlock:
747 spin_unlock(&src_mm->page_table_lock);
748 spin_unlock(&dst_mm->page_table_lock);
749out:
750 return ret;
751}
752
753/* no "address" argument so destroys page coloring of some arch */
754pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
755{
756 pgtable_t pgtable;
757
758 assert_spin_locked(&mm->page_table_lock);
759
760 /* FIFO */
761 pgtable = mm->pmd_huge_pte;
762 if (list_empty(&pgtable->lru))
763 mm->pmd_huge_pte = NULL;
764 else {
765 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
766 struct page, lru);
767 list_del(&pgtable->lru);
768 }
769 return pgtable;
770}
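/*
 * Editor's note (illustrative, not in the original file):
 * prepare_pmd_huge_pte() and get_pmd_huge_pte() form a deposit/withdraw
 * pair: every huge pmd mapping deposits one preallocated pte page table on
 * the per-mm FIFO list, and one is withdrawn again when a huge pmd is
 * zapped (zap_huge_pmd), split (__split_huge_page_map) or copied back to
 * small pages on write fault (do_huge_pmd_wp_page_fallback), so the pte
 * table is available without allocating under the page_table_lock.
 */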
771
772static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
773 struct vm_area_struct *vma,
774 unsigned long address,
775 pmd_t *pmd, pmd_t orig_pmd,
776 struct page *page,
777 unsigned long haddr)
778{
779 pgtable_t pgtable;
780 pmd_t _pmd;
781 int ret = 0, i;
782 struct page **pages;
783
784 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
785 GFP_KERNEL);
786 if (unlikely(!pages)) {
787 ret |= VM_FAULT_OOM;
788 goto out;
789 }
790
791 for (i = 0; i < HPAGE_PMD_NR; i++) {
792 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
793 vma, address);
794 if (unlikely(!pages[i] ||
795 mem_cgroup_newpage_charge(pages[i], mm,
796 GFP_KERNEL))) {
797 if (pages[i])
798 put_page(pages[i]);
799 mem_cgroup_uncharge_start();
800 while (--i >= 0) {
801 mem_cgroup_uncharge_page(pages[i]);
802 put_page(pages[i]);
803 }
804 mem_cgroup_uncharge_end();
805 kfree(pages);
806 ret |= VM_FAULT_OOM;
807 goto out;
808 }
809 }
810
811 for (i = 0; i < HPAGE_PMD_NR; i++) {
812 copy_user_highpage(pages[i], page + i,
813 haddr + PAGE_SHIFT*i, vma);
814 __SetPageUptodate(pages[i]);
815 cond_resched();
816 }
817
818 spin_lock(&mm->page_table_lock);
819 if (unlikely(!pmd_same(*pmd, orig_pmd)))
820 goto out_free_pages;
821 VM_BUG_ON(!PageHead(page));
822
823 pmdp_clear_flush_notify(vma, haddr, pmd);
824 /* leave pmd empty until pte is filled */
825
826 pgtable = get_pmd_huge_pte(mm);
827 pmd_populate(mm, &_pmd, pgtable);
828
829 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
830 pte_t *pte, entry;
831 entry = mk_pte(pages[i], vma->vm_page_prot);
832 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
833 page_add_new_anon_rmap(pages[i], vma, haddr);
834 pte = pte_offset_map(&_pmd, haddr);
835 VM_BUG_ON(!pte_none(*pte));
836 set_pte_at(mm, haddr, pte, entry);
837 pte_unmap(pte);
838 }
839 kfree(pages);
840
841 mm->nr_ptes++;
842 smp_wmb(); /* make pte visible before pmd */
843 pmd_populate(mm, pmd, pgtable);
844 page_remove_rmap(page);
845 spin_unlock(&mm->page_table_lock);
846
847 ret |= VM_FAULT_WRITE;
848 put_page(page);
849
850out:
851 return ret;
852
853out_free_pages:
854 spin_unlock(&mm->page_table_lock);
855 mem_cgroup_uncharge_start();
856 for (i = 0; i < HPAGE_PMD_NR; i++) {
857 mem_cgroup_uncharge_page(pages[i]);
858 put_page(pages[i]);
859 }
860 mem_cgroup_uncharge_end();
861 kfree(pages);
862 goto out;
863}
864
865int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
866 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
867{
868 int ret = 0;
869 struct page *page, *new_page;
870 unsigned long haddr;
871
872 VM_BUG_ON(!vma->anon_vma);
873 spin_lock(&mm->page_table_lock);
874 if (unlikely(!pmd_same(*pmd, orig_pmd)))
875 goto out_unlock;
876
877 page = pmd_page(orig_pmd);
878 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
879 haddr = address & HPAGE_PMD_MASK;
880 if (page_mapcount(page) == 1) {
881 pmd_t entry;
882 entry = pmd_mkyoung(orig_pmd);
883 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
884 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
885 update_mmu_cache(vma, address, entry);
886 ret |= VM_FAULT_WRITE;
887 goto out_unlock;
888 }
889 get_page(page);
890 spin_unlock(&mm->page_table_lock);
891
892 if (transparent_hugepage_enabled(vma) &&
893 !transparent_hugepage_debug_cow())
894 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
895 vma, haddr);
896 else
897 new_page = NULL;
898
899 if (unlikely(!new_page)) {
900 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
901 pmd, orig_pmd, page, haddr);
902 put_page(page);
903 goto out;
904 }
905
906 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
907 put_page(new_page);
908 put_page(page);
909 ret |= VM_FAULT_OOM;
910 goto out;
911 }
912
913 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
914 __SetPageUptodate(new_page);
915
916 spin_lock(&mm->page_table_lock);
917 put_page(page);
918 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
919 mem_cgroup_uncharge_page(new_page);
920 put_page(new_page);
921 } else {
922 pmd_t entry;
923 VM_BUG_ON(!PageHead(page));
924 entry = mk_pmd(new_page, vma->vm_page_prot);
925 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
926 entry = pmd_mkhuge(entry);
927 pmdp_clear_flush_notify(vma, haddr, pmd);
928 page_add_new_anon_rmap(new_page, vma, haddr);
929 set_pmd_at(mm, haddr, pmd, entry);
930 update_mmu_cache(vma, address, entry);
931 page_remove_rmap(page);
932 put_page(page);
933 ret |= VM_FAULT_WRITE;
934 }
935out_unlock:
936 spin_unlock(&mm->page_table_lock);
937out:
938 return ret;
939}
940
941struct page *follow_trans_huge_pmd(struct mm_struct *mm,
942 unsigned long addr,
943 pmd_t *pmd,
944 unsigned int flags)
945{
946 struct page *page = NULL;
947
948 assert_spin_locked(&mm->page_table_lock);
949
950 if (flags & FOLL_WRITE && !pmd_write(*pmd))
951 goto out;
952
953 page = pmd_page(*pmd);
954 VM_BUG_ON(!PageHead(page));
955 if (flags & FOLL_TOUCH) {
956 pmd_t _pmd;
957 /*
958 * We should set the dirty bit only for FOLL_WRITE but
959 * for now the dirty bit in the pmd is meaningless.
960 * And if the dirty bit will become meaningful and
961 * we'll only set it with FOLL_WRITE, an atomic
962 * set_bit will be required on the pmd to set the
963 * young bit, instead of the current set_pmd_at.
964 */
965 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
966 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
967 }
968 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
969 VM_BUG_ON(!PageCompound(page));
970 if (flags & FOLL_GET)
971 get_page(page);
972
973out:
974 return page;
975}
976
977int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
978 pmd_t *pmd)
979{
980 int ret = 0;
981
982 spin_lock(&tlb->mm->page_table_lock);
983 if (likely(pmd_trans_huge(*pmd))) {
984 if (unlikely(pmd_trans_splitting(*pmd))) {
985 spin_unlock(&tlb->mm->page_table_lock);
986 wait_split_huge_page(vma->anon_vma,
987 pmd);
988 } else {
989 struct page *page;
990 pgtable_t pgtable;
991 pgtable = get_pmd_huge_pte(tlb->mm);
992 page = pmd_page(*pmd);
993 pmd_clear(pmd);
994 page_remove_rmap(page);
995 VM_BUG_ON(page_mapcount(page) < 0);
996 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
997 VM_BUG_ON(!PageHead(page));
998 spin_unlock(&tlb->mm->page_table_lock);
999 tlb_remove_page(tlb, page);
1000 pte_free(tlb->mm, pgtable);
1001 ret = 1;
1002 }
1003 } else
1004 spin_unlock(&tlb->mm->page_table_lock);
1005
1006 return ret;
1007}
1008
1009int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1010 unsigned long addr, unsigned long end,
1011 unsigned char *vec)
1012{
1013 int ret = 0;
1014
1015 spin_lock(&vma->vm_mm->page_table_lock);
1016 if (likely(pmd_trans_huge(*pmd))) {
1017 ret = !pmd_trans_splitting(*pmd);
1018 spin_unlock(&vma->vm_mm->page_table_lock);
1019 if (unlikely(!ret))
1020 wait_split_huge_page(vma->anon_vma, pmd);
1021 else {
1022 /*
1023 * All logical pages in the range are present
1024 * if backed by a huge page.
1025 */
1026 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1027 }
1028 } else
1029 spin_unlock(&vma->vm_mm->page_table_lock);
1030
1031 return ret;
1032}
1033
1034int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1035 unsigned long addr, pgprot_t newprot)
1036{
1037 struct mm_struct *mm = vma->vm_mm;
1038 int ret = 0;
1039
1040 spin_lock(&mm->page_table_lock);
1041 if (likely(pmd_trans_huge(*pmd))) {
1042 if (unlikely(pmd_trans_splitting(*pmd))) {
1043 spin_unlock(&mm->page_table_lock);
1044 wait_split_huge_page(vma->anon_vma, pmd);
1045 } else {
1046 pmd_t entry;
1047
1048 entry = pmdp_get_and_clear(mm, addr, pmd);
1049 entry = pmd_modify(entry, newprot);
1050 set_pmd_at(mm, addr, pmd, entry);
1051 spin_unlock(&vma->vm_mm->page_table_lock);
1052 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1053 ret = 1;
1054 }
1055 } else
1056 spin_unlock(&vma->vm_mm->page_table_lock);
1057
1058 return ret;
1059}
1060
1061pmd_t *page_check_address_pmd(struct page *page,
1062 struct mm_struct *mm,
1063 unsigned long address,
1064 enum page_check_address_pmd_flag flag)
1065{
1066 pgd_t *pgd;
1067 pud_t *pud;
1068 pmd_t *pmd, *ret = NULL;
1069
1070 if (address & ~HPAGE_PMD_MASK)
1071 goto out;
1072
1073 pgd = pgd_offset(mm, address);
1074 if (!pgd_present(*pgd))
1075 goto out;
1076
1077 pud = pud_offset(pgd, address);
1078 if (!pud_present(*pud))
1079 goto out;
1080
1081 pmd = pmd_offset(pud, address);
1082 if (pmd_none(*pmd))
1083 goto out;
1084 if (pmd_page(*pmd) != page)
1085 goto out;
1086 /*
1087 * split_vma() may create temporary aliased mappings. There is
1088 * no risk as long as all huge pmd are found and have their
1089 * splitting bit set before __split_huge_page_refcount
1090 * runs. Finding the same huge pmd more than once during the
1091 * same rmap walk is not a problem.
1092 */
1093 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1094 pmd_trans_splitting(*pmd))
1095 goto out;
1096 if (pmd_trans_huge(*pmd)) {
1097 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1098 !pmd_trans_splitting(*pmd));
1099 ret = pmd;
1100 }
1101out:
1102 return ret;
1103}
1104
1105static int __split_huge_page_splitting(struct page *page,
1106 struct vm_area_struct *vma,
1107 unsigned long address)
1108{
1109 struct mm_struct *mm = vma->vm_mm;
1110 pmd_t *pmd;
1111 int ret = 0;
1112
1113 spin_lock(&mm->page_table_lock);
1114 pmd = page_check_address_pmd(page, mm, address,
1115 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1116 if (pmd) {
1117 /*
1118 * We can't temporarily set the pmd to null in order
1119 * to split it, the pmd must remain marked huge at all
1120 * times or the VM won't take the pmd_trans_huge paths
1121 * and it won't wait on the anon_vma->root->lock to
1122 * serialize against split_huge_page*.
1123 */
1124 pmdp_splitting_flush_notify(vma, address, pmd);
1125 ret = 1;
1126 }
1127 spin_unlock(&mm->page_table_lock);
1128
1129 return ret;
1130}
1131
1132static void __split_huge_page_refcount(struct page *page)
1133{
1134 int i;
1135 unsigned long head_index = page->index;
1136 struct zone *zone = page_zone(page);
1137
1138 /* prevent PageLRU to go away from under us, and freeze lru stats */
1139 spin_lock_irq(&zone->lru_lock);
1140 compound_lock(page);
1141
1142 for (i = 1; i < HPAGE_PMD_NR; i++) {
1143 struct page *page_tail = page + i;
1144
1145 /* tail_page->_count cannot change */
1146 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1147 BUG_ON(page_count(page) <= 0);
1148 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1149 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1150
1151 /* after clearing PageTail the gup refcount can be released */
1152 smp_mb();
1153
1154 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1155 page_tail->flags |= (page->flags &
1156 ((1L << PG_referenced) |
1157 (1L << PG_swapbacked) |
1158 (1L << PG_mlocked) |
1159 (1L << PG_uptodate)));
1160 page_tail->flags |= (1L << PG_dirty);
1161
1162 /*
1163 * 1) clear PageTail before overwriting first_page
1164 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1165 */
1166 smp_wmb();
1167
1168 /*
1169 * __split_huge_page_splitting() already set the
1170 * splitting bit in all pmd that could map this
1171 * hugepage, that will ensure no CPU can alter the
1172 * mapcount on the head page. The mapcount is only
1173 * accounted in the head page and it has to be
1174 * transferred to all tail pages in the below code. So
1175 * for this code to be safe, the split the mapcount
1176 * can't change. But that doesn't mean userland can't
1177 * keep changing and reading the page contents while
1178 * we transfer the mapcount, so the pmd splitting
1179 * status is achieved setting a reserved bit in the
1180 * pmd, not by clearing the present bit.
1181 */
1182 BUG_ON(page_mapcount(page_tail));
1183 page_tail->_mapcount = page->_mapcount;
1184
1185 BUG_ON(page_tail->mapping);
1186 page_tail->mapping = page->mapping;
1187
1188 page_tail->index = ++head_index;
1189
1190 BUG_ON(!PageAnon(page_tail));
1191 BUG_ON(!PageUptodate(page_tail));
1192 BUG_ON(!PageDirty(page_tail));
1193 BUG_ON(!PageSwapBacked(page_tail));
1194
1195 lru_add_page_tail(zone, page, page_tail);
1196 }
1197
1198 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1199 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1200
1201 ClearPageCompound(page);
1202 compound_unlock(page);
1203 spin_unlock_irq(&zone->lru_lock);
1204
1205 for (i = 1; i < HPAGE_PMD_NR; i++) {
1206 struct page *page_tail = page + i;
1207 BUG_ON(page_count(page_tail) <= 0);
1208 /*
1209 * Tail pages may be freed if there wasn't any mapping
1210 * like if add_to_swap() is running on a lru page that
1211 * had its mapping zapped. And freeing these pages
1212 * requires taking the lru_lock so we do the put_page
1213 * of the tail pages after the split is complete.
1214 */
1215 put_page(page_tail);
1216 }
1217
1218 /*
1219 * Only the head page (now become a regular page) is required
1220 * to be pinned by the caller.
1221 */
1222 BUG_ON(page_count(page) <= 0);
1223}
1224
1225static int __split_huge_page_map(struct page *page,
1226 struct vm_area_struct *vma,
1227 unsigned long address)
1228{
1229 struct mm_struct *mm = vma->vm_mm;
1230 pmd_t *pmd, _pmd;
1231 int ret = 0, i;
1232 pgtable_t pgtable;
1233 unsigned long haddr;
1234
1235 spin_lock(&mm->page_table_lock);
1236 pmd = page_check_address_pmd(page, mm, address,
1237 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1238 if (pmd) {
1239 pgtable = get_pmd_huge_pte(mm);
1240 pmd_populate(mm, &_pmd, pgtable);
1241
1242 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1243 i++, haddr += PAGE_SIZE) {
1244 pte_t *pte, entry;
1245 BUG_ON(PageCompound(page+i));
1246 entry = mk_pte(page + i, vma->vm_page_prot);
1247 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1248 if (!pmd_write(*pmd))
1249 entry = pte_wrprotect(entry);
1250 else
1251 BUG_ON(page_mapcount(page) != 1);
1252 if (!pmd_young(*pmd))
1253 entry = pte_mkold(entry);
1254 pte = pte_offset_map(&_pmd, haddr);
1255 BUG_ON(!pte_none(*pte));
1256 set_pte_at(mm, haddr, pte, entry);
1257 pte_unmap(pte);
1258 }
1259
1260 mm->nr_ptes++;
1261 smp_wmb(); /* make pte visible before pmd */
1262 /*
1263 * Up to this point the pmd is present and huge and
1264 * userland has the whole access to the hugepage
1265 * during the split (which happens in place). If we
1266 * overwrite the pmd with the not-huge version
1267 * pointing to the pte here (which of course we could
1268 * if all CPUs were bug free), userland could trigger
1269 * a small page size TLB miss on the small sized TLB
1270 * while the hugepage TLB entry is still established
1271 * in the huge TLB. Some CPU doesn't like that. See
1272 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
 1273 * Erratum 383 on page 93. Intel should be safe but it
1274 * also warns that it's only safe if the permission
1275 * and cache attributes of the two entries loaded in
1276 * the two TLB is identical (which should be the case
1277 * here). But it is generally safer to never allow
1278 * small and huge TLB entries for the same virtual
1279 * address to be loaded simultaneously. So instead of
1280 * doing "pmd_populate(); flush_tlb_range();" we first
1281 * mark the current pmd notpresent (atomically because
1282 * here the pmd_trans_huge and pmd_trans_splitting
1283 * must remain set at all times on the pmd until the
1284 * split is complete for this pmd), then we flush the
1285 * SMP TLB and finally we write the non-huge version
1286 * of the pmd entry with pmd_populate.
1287 */
1288 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1289 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1290 pmd_populate(mm, pmd, pgtable);
1291 ret = 1;
1292 }
1293 spin_unlock(&mm->page_table_lock);
1294
1295 return ret;
1296}
1297
1298/* must be called with anon_vma->root->lock hold */
1299static void __split_huge_page(struct page *page,
1300 struct anon_vma *anon_vma)
1301{
1302 int mapcount, mapcount2;
1303 struct anon_vma_chain *avc;
1304
1305 BUG_ON(!PageHead(page));
1306 BUG_ON(PageTail(page));
1307
1308 mapcount = 0;
1309 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1310 struct vm_area_struct *vma = avc->vma;
1311 unsigned long addr = vma_address(page, vma);
1312 BUG_ON(is_vma_temporary_stack(vma));
1313 if (addr == -EFAULT)
1314 continue;
1315 mapcount += __split_huge_page_splitting(page, vma, addr);
1316 }
1317 /*
1318 * It is critical that new vmas are added to the tail of the
 1319 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1320 * and establishes a child pmd before
1321 * __split_huge_page_splitting() freezes the parent pmd (so if
1322 * we fail to prevent copy_huge_pmd() from running until the
1323 * whole __split_huge_page() is complete), we will still see
1324 * the newly established pmd of the child later during the
1325 * walk, to be able to set it as pmd_trans_splitting too.
1326 */
1327 if (mapcount != page_mapcount(page))
1328 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1329 mapcount, page_mapcount(page));
1330 BUG_ON(mapcount != page_mapcount(page));
1331
1332 __split_huge_page_refcount(page);
1333
1334 mapcount2 = 0;
1335 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1336 struct vm_area_struct *vma = avc->vma;
1337 unsigned long addr = vma_address(page, vma);
1338 BUG_ON(is_vma_temporary_stack(vma));
1339 if (addr == -EFAULT)
1340 continue;
1341 mapcount2 += __split_huge_page_map(page, vma, addr);
1342 }
1343 if (mapcount != mapcount2)
1344 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1345 mapcount, mapcount2, page_mapcount(page));
1346 BUG_ON(mapcount != mapcount2);
1347}
1348
1349int split_huge_page(struct page *page)
1350{
1351 struct anon_vma *anon_vma;
1352 int ret = 1;
1353
1354 BUG_ON(!PageAnon(page));
1355 anon_vma = page_lock_anon_vma(page);
1356 if (!anon_vma)
1357 goto out;
1358 ret = 0;
1359 if (!PageCompound(page))
1360 goto out_unlock;
1361
1362 BUG_ON(!PageSwapBacked(page));
1363 __split_huge_page(page, anon_vma);
1364
1365 BUG_ON(PageCompound(page));
1366out_unlock:
1367 page_unlock_anon_vma(anon_vma);
1368out:
1369 return ret;
1370}
1371
1372int hugepage_madvise(unsigned long *vm_flags)
1373{
1374 /*
1375 * Be somewhat over-protective like KSM for now!
1376 */
1377 if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE |
1378 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1379 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1380 VM_MIXEDMAP | VM_SAO))
1381 return -EINVAL;
1382
1383 *vm_flags |= VM_HUGEPAGE;
1384
1385 return 0;
1386}
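/*
 * Editor's note (illustrative, not in the original file): this helper backs
 * the MADV_HUGEPAGE advice, so an application can opt a mapping in with
 * something like:
 *
 *	if (madvise(addr, length, MADV_HUGEPAGE))
 *		perror("madvise(MADV_HUGEPAGE)");
 *
 * which sets VM_HUGEPAGE on the vma unless the mapping is one of the
 * excluded types checked above.
 */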
1387
1388static int __init khugepaged_slab_init(void)
1389{
1390 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1391 sizeof(struct mm_slot),
1392 __alignof__(struct mm_slot), 0, NULL);
1393 if (!mm_slot_cache)
1394 return -ENOMEM;
1395
1396 return 0;
1397}
1398
1399static void __init khugepaged_slab_free(void)
1400{
1401 kmem_cache_destroy(mm_slot_cache);
1402 mm_slot_cache = NULL;
1403}
1404
1405static inline struct mm_slot *alloc_mm_slot(void)
1406{
1407 if (!mm_slot_cache) /* initialization failed */
1408 return NULL;
1409 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1410}
1411
1412static inline void free_mm_slot(struct mm_slot *mm_slot)
1413{
1414 kmem_cache_free(mm_slot_cache, mm_slot);
1415}
1416
1417static int __init mm_slots_hash_init(void)
1418{
1419 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1420 GFP_KERNEL);
1421 if (!mm_slots_hash)
1422 return -ENOMEM;
1423 return 0;
1424}
1425
1426#if 0
1427static void __init mm_slots_hash_free(void)
1428{
1429 kfree(mm_slots_hash);
1430 mm_slots_hash = NULL;
1431}
1432#endif
1433
1434static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1435{
1436 struct mm_slot *mm_slot;
1437 struct hlist_head *bucket;
1438 struct hlist_node *node;
1439
1440 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1441 % MM_SLOTS_HASH_HEADS];
1442 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1443 if (mm == mm_slot->mm)
1444 return mm_slot;
1445 }
1446 return NULL;
1447}
1448
1449static void insert_to_mm_slots_hash(struct mm_struct *mm,
1450 struct mm_slot *mm_slot)
1451{
1452 struct hlist_head *bucket;
1453
1454 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1455 % MM_SLOTS_HASH_HEADS];
1456 mm_slot->mm = mm;
1457 hlist_add_head(&mm_slot->hash, bucket);
1458}
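/*
 * Editor's note (illustrative, not in the original file): both helpers above
 * derive the bucket the same way, essentially
 *
 *	bucket = ((unsigned long)mm / sizeof(struct mm_struct))
 *			% MM_SLOTS_HASH_HEADS;
 *
 * i.e. the mm_struct address scaled down by the object size, which spreads
 * slab-allocated mm_structs roughly evenly over the 1024 buckets.
 */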
1459
1460static inline int khugepaged_test_exit(struct mm_struct *mm)
1461{
1462 return atomic_read(&mm->mm_users) == 0;
1463}
1464
1465int __khugepaged_enter(struct mm_struct *mm)
1466{
1467 struct mm_slot *mm_slot;
1468 int wakeup;
1469
1470 mm_slot = alloc_mm_slot();
1471 if (!mm_slot)
1472 return -ENOMEM;
1473
1474 /* __khugepaged_exit() must not run from under us */
1475 VM_BUG_ON(khugepaged_test_exit(mm));
1476 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1477 free_mm_slot(mm_slot);
1478 return 0;
1479 }
1480
1481 spin_lock(&khugepaged_mm_lock);
1482 insert_to_mm_slots_hash(mm, mm_slot);
1483 /*
1484 * Insert just behind the scanning cursor, to let the area settle
1485 * down a little.
1486 */
1487 wakeup = list_empty(&khugepaged_scan.mm_head);
1488 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1489 spin_unlock(&khugepaged_mm_lock);
1490
1491 atomic_inc(&mm->mm_count);
1492 if (wakeup)
1493 wake_up_interruptible(&khugepaged_wait);
1494
1495 return 0;
1496}
1497
1498int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1499{
1500 unsigned long hstart, hend;
1501 if (!vma->anon_vma)
1502 /*
1503 * Not yet faulted in so we will register later in the
1504 * page fault if needed.
1505 */
1506 return 0;
1507 if (vma->vm_file || vma->vm_ops)
1508 /* khugepaged not yet working on file or special mappings */
1509 return 0;
1510 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1511 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1512 hend = vma->vm_end & HPAGE_PMD_MASK;
1513 if (hstart < hend)
1514 return khugepaged_enter(vma);
1515 return 0;
1516}
1517
1518void __khugepaged_exit(struct mm_struct *mm)
1519{
1520 struct mm_slot *mm_slot;
1521 int free = 0;
1522
1523 spin_lock(&khugepaged_mm_lock);
1524 mm_slot = get_mm_slot(mm);
1525 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1526 hlist_del(&mm_slot->hash);
1527 list_del(&mm_slot->mm_node);
1528 free = 1;
1529 }
1530
1531 if (free) {
1532 spin_unlock(&khugepaged_mm_lock);
1533 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1534 free_mm_slot(mm_slot);
1535 mmdrop(mm);
1536 } else if (mm_slot) {
1537 spin_unlock(&khugepaged_mm_lock);
1538 /*
1539 * This is required to serialize against
1540 * khugepaged_test_exit() (which is guaranteed to run
1541 * under mmap sem read mode). Stop here (after we
1542 * return all pagetables will be destroyed) until
1543 * khugepaged has finished working on the pagetables
1544 * under the mmap_sem.
1545 */
1546 down_write(&mm->mmap_sem);
1547 up_write(&mm->mmap_sem);
1548 } else
1549 spin_unlock(&khugepaged_mm_lock);
1550}
1551
1552static void release_pte_page(struct page *page)
1553{
1554 /* 0 stands for page_is_file_cache(page) == false */
1555 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1556 unlock_page(page);
1557 putback_lru_page(page);
1558}
1559
1560static void release_pte_pages(pte_t *pte, pte_t *_pte)
1561{
1562 while (--_pte >= pte) {
1563 pte_t pteval = *_pte;
1564 if (!pte_none(pteval))
1565 release_pte_page(pte_page(pteval));
1566 }
1567}
1568
1569static void release_all_pte_pages(pte_t *pte)
1570{
1571 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1572}
1573
1574static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1575 unsigned long address,
1576 pte_t *pte)
1577{
1578 struct page *page;
1579 pte_t *_pte;
1580 int referenced = 0, isolated = 0, none = 0;
1581 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1582 _pte++, address += PAGE_SIZE) {
1583 pte_t pteval = *_pte;
1584 if (pte_none(pteval)) {
1585 if (++none <= khugepaged_max_ptes_none)
1586 continue;
1587 else {
1588 release_pte_pages(pte, _pte);
1589 goto out;
1590 }
1591 }
1592 if (!pte_present(pteval) || !pte_write(pteval)) {
1593 release_pte_pages(pte, _pte);
1594 goto out;
1595 }
1596 page = vm_normal_page(vma, address, pteval);
1597 if (unlikely(!page)) {
1598 release_pte_pages(pte, _pte);
1599 goto out;
1600 }
1601 VM_BUG_ON(PageCompound(page));
1602 BUG_ON(!PageAnon(page));
1603 VM_BUG_ON(!PageSwapBacked(page));
1604
1605 /* cannot use mapcount: can't collapse if there's a gup pin */
1606 if (page_count(page) != 1) {
1607 release_pte_pages(pte, _pte);
1608 goto out;
1609 }
1610 /*
1611 * We can do it before isolate_lru_page because the
1612 * page can't be freed from under us. NOTE: PG_lock
1613 * is needed to serialize against split_huge_page
1614 * when invoked from the VM.
1615 */
1616 if (!trylock_page(page)) {
1617 release_pte_pages(pte, _pte);
1618 goto out;
1619 }
1620 /*
1621 * Isolate the page to avoid collapsing an hugepage
1622 * currently in use by the VM.
1623 */
1624 if (isolate_lru_page(page)) {
1625 unlock_page(page);
1626 release_pte_pages(pte, _pte);
1627 goto out;
1628 }
1629 /* 0 stands for page_is_file_cache(page) == false */
1630 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1631 VM_BUG_ON(!PageLocked(page));
1632 VM_BUG_ON(PageLRU(page));
1633
1634 /* If there is no mapped pte young don't collapse the page */
1635 if (pte_young(pteval))
1636 referenced = 1;
1637 }
1638 if (unlikely(!referenced))
1639 release_all_pte_pages(pte);
1640 else
1641 isolated = 1;
1642out:
1643 return isolated;
1644}
1645
1646static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1647 struct vm_area_struct *vma,
1648 unsigned long address,
1649 spinlock_t *ptl)
1650{
1651 pte_t *_pte;
1652 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1653 pte_t pteval = *_pte;
1654 struct page *src_page;
1655
1656 if (pte_none(pteval)) {
1657 clear_user_highpage(page, address);
1658 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1659 } else {
1660 src_page = pte_page(pteval);
1661 copy_user_highpage(page, src_page, address, vma);
1662 VM_BUG_ON(page_mapcount(src_page) != 1);
1663 VM_BUG_ON(page_count(src_page) != 2);
1664 release_pte_page(src_page);
1665 /*
1666 * ptl mostly unnecessary, but preempt has to
1667 * be disabled to update the per-cpu stats
1668 * inside page_remove_rmap().
1669 */
1670 spin_lock(ptl);
1671 /*
1672 * paravirt calls inside pte_clear here are
1673 * superfluous.
1674 */
1675 pte_clear(vma->vm_mm, address, _pte);
1676 page_remove_rmap(src_page);
1677 spin_unlock(ptl);
1678 free_page_and_swap_cache(src_page);
1679 }
1680
1681 address += PAGE_SIZE;
1682 page++;
1683 }
1684}
1685
1686static void collapse_huge_page(struct mm_struct *mm,
1687 unsigned long address,
1688 struct page **hpage,
1689 struct vm_area_struct *vma)
1690{
1691 pgd_t *pgd;
1692 pud_t *pud;
1693 pmd_t *pmd, _pmd;
1694 pte_t *pte;
1695 pgtable_t pgtable;
1696 struct page *new_page;
1697 spinlock_t *ptl;
1698 int isolated;
1699 unsigned long hstart, hend;
1700
1701 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1702#ifndef CONFIG_NUMA
1703 VM_BUG_ON(!*hpage);
1704 new_page = *hpage;
1705#else
1706 VM_BUG_ON(*hpage);
1707 /*
1708 * Allocate the page while the vma is still valid and under
1709 * the mmap_sem read mode so there is no memory allocation
1710 * later when we take the mmap_sem in write mode. This is more
1711 * friendly behavior (OTOH it may actually hide bugs) to
1712 * filesystems in userland with daemons allocating memory in
1713 * the userland I/O paths. Allocating memory with the
 1714 * mmap_sem in read mode is a good idea also to allow greater
1715 * scalability.
1716 */
1717 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1718 if (unlikely(!new_page)) {
1719 up_read(&mm->mmap_sem);
1720 *hpage = ERR_PTR(-ENOMEM);
1721 return;
1722 }
1723#endif
1724 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1725 up_read(&mm->mmap_sem);
1726 put_page(new_page);
1727 return;
1728 }
1729
1730 /* after allocating the hugepage upgrade to mmap_sem write mode */
1731 up_read(&mm->mmap_sem);
1732
1733 /*
1734 * Prevent all access to pagetables with the exception of
 1735 * gup_fast later handled by the ptep_clear_flush and the VM
1736 * handled by the anon_vma lock + PG_lock.
1737 */
1738 down_write(&mm->mmap_sem);
1739 if (unlikely(khugepaged_test_exit(mm)))
1740 goto out;
1741
1742 vma = find_vma(mm, address);
1743 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1744 hend = vma->vm_end & HPAGE_PMD_MASK;
1745 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1746 goto out;
1747
1748 if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
1749 goto out;
1750
1751 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1752 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1753 goto out;
1754 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1755
1756 pgd = pgd_offset(mm, address);
1757 if (!pgd_present(*pgd))
1758 goto out;
1759
1760 pud = pud_offset(pgd, address);
1761 if (!pud_present(*pud))
1762 goto out;
1763
1764 pmd = pmd_offset(pud, address);
1765 /* pmd can't go away or become huge under us */
1766 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1767 goto out;
1768
1769 anon_vma_lock(vma->anon_vma);
1770
1771 pte = pte_offset_map(pmd, address);
1772 ptl = pte_lockptr(mm, pmd);
1773
1774 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1775 /*
1776 * After this gup_fast can't run anymore. This also removes
1777 * any huge TLB entry from the CPU so we won't allow
1778 * huge and small TLB entries for the same virtual address
1779 * to avoid the risk of CPU bugs in that area.
1780 */
1781 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1782 spin_unlock(&mm->page_table_lock);
1783
1784 spin_lock(ptl);
1785 isolated = __collapse_huge_page_isolate(vma, address, pte);
1786 spin_unlock(ptl);
1787 pte_unmap(pte);
1788
1789 if (unlikely(!isolated)) {
1790 spin_lock(&mm->page_table_lock);
1791 BUG_ON(!pmd_none(*pmd));
1792 set_pmd_at(mm, address, pmd, _pmd);
1793 spin_unlock(&mm->page_table_lock);
1794 anon_vma_unlock(vma->anon_vma);
1795 mem_cgroup_uncharge_page(new_page);
1796 goto out;
1797 }
1798
1799 /*
1800 * All pages are isolated and locked so anon_vma rmap
1801 * can't run anymore.
1802 */
1803 anon_vma_unlock(vma->anon_vma);
1804
1805 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1806 __SetPageUptodate(new_page);
1807 pgtable = pmd_pgtable(_pmd);
1808 VM_BUG_ON(page_count(pgtable) != 1);
1809 VM_BUG_ON(page_mapcount(pgtable) != 0);
1810
1811 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1812 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1813 _pmd = pmd_mkhuge(_pmd);
1814
1815 /*
1816 * spin_lock() below is not the equivalent of smp_wmb(), so
1817 * this is needed to avoid the copy_huge_page writes to become
1818 * visible after the set_pmd_at() write.
1819 */
1820 smp_wmb();
1821
1822 spin_lock(&mm->page_table_lock);
1823 BUG_ON(!pmd_none(*pmd));
1824 page_add_new_anon_rmap(new_page, vma, address);
1825 set_pmd_at(mm, address, pmd, _pmd);
1826 update_mmu_cache(vma, address, entry);
1827 prepare_pmd_huge_pte(pgtable, mm);
1828 mm->nr_ptes--;
1829 spin_unlock(&mm->page_table_lock);
1830
1831#ifndef CONFIG_NUMA
1832 *hpage = NULL;
1833#endif
1834 khugepaged_pages_collapsed++;
1835out_up_write:
1836 up_write(&mm->mmap_sem);
1837 return;
1838
1839out:
1840#ifdef CONFIG_NUMA
1841 put_page(new_page);
1842#endif
1843 goto out_up_write;
1844}
1845
1846static int khugepaged_scan_pmd(struct mm_struct *mm,
1847 struct vm_area_struct *vma,
1848 unsigned long address,
1849 struct page **hpage)
1850{
1851 pgd_t *pgd;
1852 pud_t *pud;
1853 pmd_t *pmd;
1854 pte_t *pte, *_pte;
1855 int ret = 0, referenced = 0, none = 0;
1856 struct page *page;
1857 unsigned long _address;
1858 spinlock_t *ptl;
1859
1860 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1861
1862 pgd = pgd_offset(mm, address);
1863 if (!pgd_present(*pgd))
1864 goto out;
1865
1866 pud = pud_offset(pgd, address);
1867 if (!pud_present(*pud))
1868 goto out;
1869
1870 pmd = pmd_offset(pud, address);
1871 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1872 goto out;
1873
1874 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1875 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1876 _pte++, _address += PAGE_SIZE) {
1877 pte_t pteval = *_pte;
1878 if (pte_none(pteval)) {
1879 if (++none <= khugepaged_max_ptes_none)
1880 continue;
1881 else
1882 goto out_unmap;
1883 }
1884 if (!pte_present(pteval) || !pte_write(pteval))
1885 goto out_unmap;
1886 page = vm_normal_page(vma, _address, pteval);
1887 if (unlikely(!page))
1888 goto out_unmap;
1889 VM_BUG_ON(PageCompound(page));
1890 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1891 goto out_unmap;
1892 /* cannot use mapcount: can't collapse if there's a gup pin */
1893 if (page_count(page) != 1)
1894 goto out_unmap;
1895 if (pte_young(pteval))
1896 referenced = 1;
1897 }
1898 if (referenced)
1899 ret = 1;
1900out_unmap:
1901 pte_unmap_unlock(pte, ptl);
1902 if (ret)
1903 /* collapse_huge_page will return with the mmap_sem released */
1904 collapse_huge_page(mm, address, hpage, vma);
1905out:
1906 return ret;
1907}
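The scan heuristics above (at most khugepaged_max_ptes_none empty ptes, at least one young pte, no extra page references) are exported as runtime tunables under sysfs. A minimal sketch that reads a few of them, assuming the standard /sys/kernel/mm/transparent_hugepage/khugepaged/ layout is mounted and readable:

#include <stdio.h>

static long read_tunable(const char *name)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("max_ptes_none        = %ld\n", read_tunable("max_ptes_none"));
	printf("pages_to_scan        = %ld\n", read_tunable("pages_to_scan"));
	printf("scan_sleep_millisecs = %ld\n",
	       read_tunable("scan_sleep_millisecs"));
	return 0;
}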
1908
1909static void collect_mm_slot(struct mm_slot *mm_slot)
1910{
1911 struct mm_struct *mm = mm_slot->mm;
1912
1913 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1914
1915 if (khugepaged_test_exit(mm)) {
1916 /* free mm_slot */
1917 hlist_del(&mm_slot->hash);
1918 list_del(&mm_slot->mm_node);
1919
1920 /*
1921 * Not strictly needed because the mm exited already.
1922 *
1923 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1924 */
1925
1926 /* khugepaged_mm_lock actually not necessary for the below */
1927 free_mm_slot(mm_slot);
1928 mmdrop(mm);
1929 }
1930}
1931
1932static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1933 struct page **hpage)
1934{
1935 struct mm_slot *mm_slot;
1936 struct mm_struct *mm;
1937 struct vm_area_struct *vma;
1938 int progress = 0;
1939
1940 VM_BUG_ON(!pages);
1941 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1942
1943 if (khugepaged_scan.mm_slot)
1944 mm_slot = khugepaged_scan.mm_slot;
1945 else {
1946 mm_slot = list_entry(khugepaged_scan.mm_head.next,
1947 struct mm_slot, mm_node);
1948 khugepaged_scan.address = 0;
1949 khugepaged_scan.mm_slot = mm_slot;
1950 }
1951 spin_unlock(&khugepaged_mm_lock);
1952
1953 mm = mm_slot->mm;
1954 down_read(&mm->mmap_sem);
1955 if (unlikely(khugepaged_test_exit(mm)))
1956 vma = NULL;
1957 else
1958 vma = find_vma(mm, khugepaged_scan.address);
1959
1960 progress++;
1961 for (; vma; vma = vma->vm_next) {
1962 unsigned long hstart, hend;
1963
1964 cond_resched();
1965 if (unlikely(khugepaged_test_exit(mm))) {
1966 progress++;
1967 break;
1968 }
1969
1970 if (!(vma->vm_flags & VM_HUGEPAGE) &&
1971 !khugepaged_always()) {
1972 progress++;
1973 continue;
1974 }
1975
1976 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1977 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
1978 khugepaged_scan.address = vma->vm_end;
1979 progress++;
1980 continue;
1981 }
1982 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1983
1984 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1985 hend = vma->vm_end & HPAGE_PMD_MASK;
1986 if (hstart >= hend) {
1987 progress++;
1988 continue;
1989 }
1990 if (khugepaged_scan.address < hstart)
1991 khugepaged_scan.address = hstart;
1992 if (khugepaged_scan.address > hend) {
1993 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
1994 progress++;
1995 continue;
1996 }
1997 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
1998
1999 while (khugepaged_scan.address < hend) {
2000 int ret;
2001 cond_resched();
2002 if (unlikely(khugepaged_test_exit(mm)))
2003 goto breakouterloop;
2004
2005 VM_BUG_ON(khugepaged_scan.address < hstart ||
2006 khugepaged_scan.address + HPAGE_PMD_SIZE >
2007 hend);
2008 ret = khugepaged_scan_pmd(mm, vma,
2009 khugepaged_scan.address,
2010 hpage);
2011 /* move to next address */
2012 khugepaged_scan.address += HPAGE_PMD_SIZE;
2013 progress += HPAGE_PMD_NR;
2014 if (ret)
2015 /* we released mmap_sem so break loop */
2016 goto breakouterloop_mmap_sem;
2017 if (progress >= pages)
2018 goto breakouterloop;
2019 }
2020 }
2021breakouterloop:
2022 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2023breakouterloop_mmap_sem:
2024
2025 spin_lock(&khugepaged_mm_lock);
2026 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2027 /*
2028 * Release the current mm_slot if this mm is about to die, or
2029 * if we scanned all vmas of this mm.
2030 */
2031 if (khugepaged_test_exit(mm) || !vma) {
2032 /*
2033 * Make sure that if mm_users is reaching zero while
2034 * khugepaged runs here, khugepaged_exit will find
2035 * mm_slot not pointing to the exiting mm.
2036 */
2037 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2038 khugepaged_scan.mm_slot = list_entry(
2039 mm_slot->mm_node.next,
2040 struct mm_slot, mm_node);
2041 khugepaged_scan.address = 0;
2042 } else {
2043 khugepaged_scan.mm_slot = NULL;
2044 khugepaged_full_scans++;
2045 }
2046
2047 collect_mm_slot(mm_slot);
2048 }
2049
2050 return progress;
2051}
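The hstart/hend computation above rounds a vma down to the pmd-aligned subrange that khugepaged can actually collapse. A minimal userspace sketch of that rounding, assuming 2MB huge pmds (x86-64 with 4k base pages); the sample addresses are arbitrary:

#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL * 1024 * 1024)	/* assumption: 2MB huge pmd */
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x00601234UL;
	unsigned long vm_end   = 0x00a00000UL;

	/* round start up and end down, as khugepaged_scan_mm_slot() does */
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;

	if (hstart < hend)
		printf("scannable range: [%#lx, %#lx)\n", hstart, hend);
	else
		printf("vma too small or misaligned for a huge pmd\n");
	return 0;
}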
2052
2053static int khugepaged_has_work(void)
2054{
2055 return !list_empty(&khugepaged_scan.mm_head) &&
2056 khugepaged_enabled();
2057}
2058
2059static int khugepaged_wait_event(void)
2060{
2061 return !list_empty(&khugepaged_scan.mm_head) ||
2062 !khugepaged_enabled();
2063}
2064
2065static void khugepaged_do_scan(struct page **hpage)
2066{
2067 unsigned int progress = 0, pass_through_head = 0;
2068 unsigned int pages = khugepaged_pages_to_scan;
2069
2070 barrier(); /* write khugepaged_pages_to_scan to local stack */
2071
2072 while (progress < pages) {
2073 cond_resched();
2074
0bbbc0b3 2075#ifndef CONFIG_NUMA
2076 if (!*hpage) {
2077 *hpage = alloc_hugepage(khugepaged_defrag());
2078 if (unlikely(!*hpage))
2079 break;
2080 }
2081#else
2082 if (IS_ERR(*hpage))
2083 break;
2084#endif
2085
2086 spin_lock(&khugepaged_mm_lock);
2087 if (!khugepaged_scan.mm_slot)
2088 pass_through_head++;
2089 if (khugepaged_has_work() &&
2090 pass_through_head < 2)
2091 progress += khugepaged_scan_mm_slot(pages - progress,
2092 hpage);
2093 else
2094 progress = pages;
2095 spin_unlock(&khugepaged_mm_lock);
2096 }
2097}
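Unless transparent hugepages are set to "always", khugepaged_scan_mm_slot() above skips vmas that don't have VM_HUGEPAGE set; userspace opts a mapping in with madvise(MADV_HUGEPAGE). A minimal sketch of that userspace side, assuming a libc that exposes MADV_HUGEPAGE (the fallback value 14 is the Linux ABI constant):

#include <sys/mman.h>
#include <stdio.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* Linux madvise advice value */
#endif

int main(void)
{
	size_t len = 8UL << 20;	/* 8MB: room for several huge pmds */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* sets VM_HUGEPAGE so the khugepaged scan won't skip this vma */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise");
	munmap(p, len);
	return 0;
}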
2098
2099static void khugepaged_alloc_sleep(void)
2100{
2101 DEFINE_WAIT(wait);
2102 add_wait_queue(&khugepaged_wait, &wait);
2103 schedule_timeout_interruptible(
2104 msecs_to_jiffies(
2105 khugepaged_alloc_sleep_millisecs));
2106 remove_wait_queue(&khugepaged_wait, &wait);
2107}
2108
2109#ifndef CONFIG_NUMA
2110static struct page *khugepaged_alloc_hugepage(void)
2111{
2112 struct page *hpage;
2113
2114 do {
2115 hpage = alloc_hugepage(khugepaged_defrag());
2116 if (!hpage)
2117 khugepaged_alloc_sleep();
2118 } while (unlikely(!hpage) &&
2119 likely(khugepaged_enabled()));
2120 return hpage;
2121}
0bbbc0b3 2122#endif
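khugepaged_alloc_hugepage() above polls the allocator, sleeping khugepaged_alloc_sleep_millisecs between attempts for as long as khugepaged stays enabled. A rough userspace analogue of that retry-with-sleep loop; malloc() and the enabled flag merely stand in for alloc_hugepage() and khugepaged_enabled():

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

static int enabled = 1;			/* stands in for khugepaged_enabled() */

static void *alloc_with_backoff(size_t size, unsigned int sleep_ms)
{
	void *p;

	do {
		p = malloc(size);	/* stands in for alloc_hugepage() */
		if (!p) {
			struct timespec ts = {
				.tv_sec = sleep_ms / 1000,
				.tv_nsec = (sleep_ms % 1000) * 1000000L,
			};
			nanosleep(&ts, NULL);	/* ~ khugepaged_alloc_sleep() */
		}
	} while (!p && enabled);
	return p;
}

int main(void)
{
	void *p = alloc_with_backoff(2UL << 20, 100);

	printf("%s\n", p ? "allocated" : "gave up");
	free(p);
	return 0;
}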
2123
2124static void khugepaged_loop(void)
2125{
2126 struct page *hpage;
2127
2128#ifdef CONFIG_NUMA
2129 hpage = NULL;
2130#endif
ba76149f 2131 while (likely(khugepaged_enabled())) {
0bbbc0b3 2132#ifndef CONFIG_NUMA
2133 hpage = khugepaged_alloc_hugepage();
2134 if (unlikely(!hpage))
2135 break;
2136#else
2137 if (IS_ERR(hpage)) {
2138 khugepaged_alloc_sleep();
2139 hpage = NULL;
2140 }
2141#endif
2142
2143 khugepaged_do_scan(&hpage);
0bbbc0b3 2144#ifndef CONFIG_NUMA
2145 if (hpage)
2146 put_page(hpage);
0bbbc0b3 2147#endif
2148 if (khugepaged_has_work()) {
2149 DEFINE_WAIT(wait);
2150 if (!khugepaged_scan_sleep_millisecs)
2151 continue;
2152 add_wait_queue(&khugepaged_wait, &wait);
2153 schedule_timeout_interruptible(
2154 msecs_to_jiffies(
2155 khugepaged_scan_sleep_millisecs));
2156 remove_wait_queue(&khugepaged_wait, &wait);
2157 } else if (khugepaged_enabled())
2158 wait_event_interruptible(khugepaged_wait,
2159 khugepaged_wait_event());
2160 }
2161}
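khugepaged_loop() above sleeps between scan passes and, when there is no work, parks on khugepaged_wait until either work arrives or khugepaged is disabled. A minimal pthread sketch of the same wait-for-work-or-shutdown pattern; the names (scanner, have_work, enabled) are illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int have_work;
static int enabled = 1;

static void *scanner(void *arg)
{
	pthread_mutex_lock(&lock);
	while (enabled) {
		while (enabled && !have_work)
			pthread_cond_wait(&wake, &lock);  /* ~ wait_event_interruptible() */
		if (have_work) {
			have_work = 0;
			printf("scanning a batch\n");     /* ~ khugepaged_do_scan() */
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scanner, NULL);

	pthread_mutex_lock(&lock);
	have_work = 1;			/* ~ a new mm being registered for scanning */
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);

	pthread_mutex_lock(&lock);
	enabled = 0;			/* ~ disabling khugepaged */
	pthread_cond_broadcast(&wake);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}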
2162
2163static int khugepaged(void *none)
2164{
2165 struct mm_slot *mm_slot;
2166
2167 set_user_nice(current, 19);
2168
2169 /* serialize with start_khugepaged() */
2170 mutex_lock(&khugepaged_mutex);
2171
2172 for (;;) {
2173 mutex_unlock(&khugepaged_mutex);
2174 BUG_ON(khugepaged_thread != current);
2175 khugepaged_loop();
2176 BUG_ON(khugepaged_thread != current);
2177
2178 mutex_lock(&khugepaged_mutex);
2179 if (!khugepaged_enabled())
2180 break;
2181 }
2182
2183 spin_lock(&khugepaged_mm_lock);
2184 mm_slot = khugepaged_scan.mm_slot;
2185 khugepaged_scan.mm_slot = NULL;
2186 if (mm_slot)
2187 collect_mm_slot(mm_slot);
2188 spin_unlock(&khugepaged_mm_lock);
2189
2190 khugepaged_thread = NULL;
2191 mutex_unlock(&khugepaged_mutex);
2192
2193 return 0;
2194}
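The khugepaged kernel thread above is started and stopped through the transparent_hugepage sysfs knobs handled earlier in this file; writing "never" eventually makes khugepaged_enabled() false and lets the thread exit. A minimal sketch of flipping the knobs from userspace (requires root; paths assume the standard sysfs mount):

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* accepted values: "always", "madvise" or "never" */
	if (write_knob("/sys/kernel/mm/transparent_hugepage/enabled", "madvise"))
		perror("enabled");
	if (write_knob("/sys/kernel/mm/transparent_hugepage/defrag", "madvise"))
		perror("defrag");
	return 0;
}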
2195
2196void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2197{
2198 struct page *page;
2199
2200 spin_lock(&mm->page_table_lock);
2201 if (unlikely(!pmd_trans_huge(*pmd))) {
2202 spin_unlock(&mm->page_table_lock);
2203 return;
2204 }
2205 page = pmd_page(*pmd);
2206 VM_BUG_ON(!page_count(page));
2207 get_page(page);
2208 spin_unlock(&mm->page_table_lock);
2209
2210 split_huge_page(page);
2211
2212 put_page(page);
2213 BUG_ON(pmd_trans_huge(*pmd));
2214}
2215
2216static void split_huge_page_address(struct mm_struct *mm,
2217 unsigned long address)
2218{
2219 pgd_t *pgd;
2220 pud_t *pud;
2221 pmd_t *pmd;
2222
2223 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2224
2225 pgd = pgd_offset(mm, address);
2226 if (!pgd_present(*pgd))
2227 return;
2228
2229 pud = pud_offset(pgd, address);
2230 if (!pud_present(*pud))
2231 return;
2232
2233 pmd = pmd_offset(pud, address);
2234 if (!pmd_present(*pmd))
2235 return;
 2236	/*
 2237	 * The caller holds mmap_sem in write mode, so a huge pmd cannot
 2238	 * materialize from under us.
 2239	 */
2240 split_huge_page_pmd(mm, pmd);
2241}
2242
2243void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2244 unsigned long start,
2245 unsigned long end,
2246 long adjust_next)
2247{
 2248	/*
 2249	 * If the new start address isn't hpage aligned and the range
 2250	 * could previously have contained a hugepage, check whether a
 2251	 * huge pmd needs to be split.
 2252	 */
2253 if (start & ~HPAGE_PMD_MASK &&
2254 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2255 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2256 split_huge_page_address(vma->vm_mm, start);
2257
 2258	/*
 2259	 * If the new end address isn't hpage aligned and the range
 2260	 * could previously have contained a hugepage, check whether a
 2261	 * huge pmd needs to be split.
 2262	 */
2263 if (end & ~HPAGE_PMD_MASK &&
2264 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2265 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2266 split_huge_page_address(vma->vm_mm, end);
2267
 2268	/*
 2269	 * If we're also updating vma->vm_next->vm_start, and the new
 2270	 * vm_next->vm_start isn't hpage aligned but could previously
 2271	 * have contained a hugepage, check whether a huge pmd needs to be split.
 2272	 */
2273 if (adjust_next > 0) {
2274 struct vm_area_struct *next = vma->vm_next;
2275 unsigned long nstart = next->vm_start;
2276 nstart += adjust_next << PAGE_SHIFT;
2277 if (nstart & ~HPAGE_PMD_MASK &&
2278 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2279 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2280 split_huge_page_address(next->vm_mm, nstart);
2281 }
2282}
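The three checks above share one predicate: a vma boundary only needs a pmd split when it is not hpage aligned and the huge pmd that would contain it lies entirely inside the vma. A minimal userspace sketch of that predicate, assuming 2MB huge pmds; the addresses in main() are arbitrary examples:

#include <stdio.h>
#include <stdbool.h>

#define HPAGE_PMD_SIZE	(2UL * 1024 * 1024)	/* assumption: 2MB huge pmd */
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

static bool may_need_split(unsigned long addr,
			   unsigned long vm_start, unsigned long vm_end)
{
	/* an hpage-aligned boundary can't cut a huge pmd in half */
	if (!(addr & ~HPAGE_PMD_MASK))
		return false;
	/* the huge pmd containing addr must fit inside [vm_start, vm_end) */
	return (addr & HPAGE_PMD_MASK) >= vm_start &&
	       (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;
}

int main(void)
{
	/* boundary at 0x80100000 inside a [0x80000000, 0x80400000) vma: split */
	printf("%d\n", may_need_split(0x80100000UL, 0x80000000UL, 0x80400000UL));
	/* already pmd-aligned boundary: nothing to split */
	printf("%d\n", may_need_split(0x80200000UL, 0x80000000UL, 0x80400000UL));
	return 0;
}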