mm: fix BUG in __split_huge_page_pmd
[deliverable/linux.git] / mm / huge_memory.c
index d94f7dee3997cd7d5bcad601a339be9f58507852..610e3df2768a6a5b2ec1e293da4c96dafbbe2d30 100644 (file)
@@ -422,7 +422,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
        unsigned long msecs;
        int err;
 
-       err = strict_strtoul(buf, 10, &msecs);
+       err = kstrtoul(buf, 10, &msecs);
        if (err || msecs > UINT_MAX)
                return -EINVAL;
 
@@ -449,7 +449,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
        unsigned long msecs;
        int err;
 
-       err = strict_strtoul(buf, 10, &msecs);
+       err = kstrtoul(buf, 10, &msecs);
        if (err || msecs > UINT_MAX)
                return -EINVAL;
 
@@ -475,7 +475,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
        int err;
        unsigned long pages;
 
-       err = strict_strtoul(buf, 10, &pages);
+       err = kstrtoul(buf, 10, &pages);
        if (err || !pages || pages > UINT_MAX)
                return -EINVAL;
 
@@ -543,7 +543,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
        int err;
        unsigned long max_ptes_none;
 
-       err = strict_strtoul(buf, 10, &max_ptes_none);
+       err = kstrtoul(buf, 10, &max_ptes_none);
        if (err || max_ptes_none > HPAGE_PMD_NR-1)
                return -EINVAL;
 
@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
        return pmd;
 }
 
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
        pmd_t entry;
-       entry = mk_pmd(page, vma->vm_page_prot);
-       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = mk_pmd(page, prot);
        entry = pmd_mkhuge(entry);
        return entry;
 }
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
-               entry = mk_huge_pmd(page, vma);
+               entry = mk_huge_pmd(page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *page;
        unsigned long haddr = address & HPAGE_PMD_MASK;
-       pte_t *pte;
 
-       if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
-               if (unlikely(anon_vma_prepare(vma)))
-                       return VM_FAULT_OOM;
-               if (unlikely(khugepaged_enter(vma)))
+       if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+               return VM_FAULT_FALLBACK;
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
+       if (unlikely(khugepaged_enter(vma)))
+               return VM_FAULT_OOM;
+       if (!(flags & FAULT_FLAG_WRITE) &&
+                       transparent_hugepage_use_zero_page()) {
+               pgtable_t pgtable;
+               struct page *zero_page;
+               bool set;
+               pgtable = pte_alloc_one(mm, haddr);
+               if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
-               if (!(flags & FAULT_FLAG_WRITE) &&
-                               transparent_hugepage_use_zero_page()) {
-                       pgtable_t pgtable;
-                       struct page *zero_page;
-                       bool set;
-                       pgtable = pte_alloc_one(mm, haddr);
-                       if (unlikely(!pgtable))
-                               return VM_FAULT_OOM;
-                       zero_page = get_huge_zero_page();
-                       if (unlikely(!zero_page)) {
-                               pte_free(mm, pgtable);
-                               count_vm_event(THP_FAULT_FALLBACK);
-                               goto out;
-                       }
-                       spin_lock(&mm->page_table_lock);
-                       set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-                                       zero_page);
-                       spin_unlock(&mm->page_table_lock);
-                       if (!set) {
-                               pte_free(mm, pgtable);
-                               put_huge_zero_page();
-                       }
-                       return 0;
-               }
-               page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                         vma, haddr, numa_node_id(), 0);
-               if (unlikely(!page)) {
+               zero_page = get_huge_zero_page();
+               if (unlikely(!zero_page)) {
+                       pte_free(mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
-                       goto out;
+                       return VM_FAULT_FALLBACK;
                }
-               count_vm_event(THP_FAULT_ALLOC);
-               if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
-                       put_page(page);
-                       goto out;
-               }
-               if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
-                                                         page))) {
-                       mem_cgroup_uncharge_page(page);
-                       put_page(page);
-                       goto out;
+               spin_lock(&mm->page_table_lock);
+               set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+                               zero_page);
+               spin_unlock(&mm->page_table_lock);
+               if (!set) {
+                       pte_free(mm, pgtable);
+                       put_huge_zero_page();
                }
-
                return 0;
        }
-out:
-       /*
-        * Use __pte_alloc instead of pte_alloc_map, because we can't
-        * run pte_offset_map on the pmd, if an huge pmd could
-        * materialize from under us from a different thread.
-        */
-       if (unlikely(pmd_none(*pmd)) &&
-           unlikely(__pte_alloc(mm, vma, pmd, address)))
-               return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               return 0;
-       /*
-        * A regular pmd is established and it can't morph into a huge pmd
-        * from under us anymore at this point because we hold the mmap_sem
-        * read mode and khugepaged takes it in write mode. So now it's
-        * safe to run pte_offset_map().
-        */
-       pte = pte_offset_map(pmd, address);
-       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+       page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                       vma, haddr, numa_node_id(), 0);
+       if (unlikely(!page)) {
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+       if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+               mem_cgroup_uncharge_page(page);
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+
+       count_vm_event(THP_FAULT_ALLOC);
+       return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1170,7 +1150,6 @@ alloc:
                new_page = NULL;
 
        if (unlikely(!new_page)) {
-               count_vm_event(THP_FAULT_FALLBACK);
                if (is_huge_zero_pmd(orig_pmd)) {
                        ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
                                        address, pmd, orig_pmd, haddr);
@@ -1181,9 +1160,9 @@ alloc:
                                split_huge_page(page);
                        put_page(page);
                }
+               count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
-       count_vm_event(THP_FAULT_ALLOC);
 
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
                put_page(new_page);
@@ -1191,10 +1170,13 @@ alloc:
                        split_huge_page(page);
                        put_page(page);
                }
+               count_vm_event(THP_FAULT_FALLBACK);
                ret |= VM_FAULT_OOM;
                goto out;
        }
 
+       count_vm_event(THP_FAULT_ALLOC);
+
        if (is_huge_zero_pmd(orig_pmd))
                clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
        else
@@ -1215,7 +1197,8 @@ alloc:
                goto out_mn;
        } else {
                pmd_t entry;
-               entry = mk_huge_pmd(new_page, vma);
+               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_clear_flush(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page,
        BUG_ON(atomic_read(&page->_count) <= 0);
 
        __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-       __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
        ClearPageCompound(page);
        compound_unlock(page);
@@ -2301,6 +2283,8 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
 
        vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2362,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
 
-       _pmd = mk_huge_pmd(new_page, vma);
+       _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+       _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2712,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 
        mmun_start = haddr;
        mmun_end   = haddr + HPAGE_PMD_SIZE;
+again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2734,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        split_huge_page(page);
 
        put_page(page);
-       BUG_ON(pmd_trans_huge(*pmd));
+
+       /*
+        * We don't always have down_write of mmap_sem here: a racing
+        * do_huge_pmd_wp_page() might have copied-on-write to another
+        * huge page before our split_huge_page() got the anon_vma lock.
+        */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               goto again;
 }
 
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
This page took 0.033734 seconds and 5 git commands to generate.