/*
 * This file contains ioremap and related functions for 64-bit machines.
 *
 * Derived from arch/ppc64/mm/init.c
 *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *   Copyright (C) 1996 Paul Mackerras
 *
 * Derived from "arch/i386/mm/init.c"
 *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Dave Engebretsen <engebret@us.ibm.com>
 *   Rework for PPC64 port.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>

#include "mmu_decl.h"

/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

unsigned long ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PPC_MMU_NOHASH
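/*
 * Allocate a zeroed, naturally aligned page table of the given size
 * before the slab allocator is up, from bootmem or memblock depending
 * on how far boot has progressed.
 */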
static __ref void *early_alloc_pgtable(unsigned long size)
{
        void *pt;

        if (init_bootmem_done)
                pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
        else
                pt = __va(memblock_alloc_base(size, size,
                                              __pa(MAX_DMA_ADDRESS)));
        memset(pt, 0, size);

        return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

/*
 * map_kernel_page is currently only called by __ioremap.
 * It adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it.
 */
int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        if (slab_is_available()) {
                pgdp = pgd_offset_k(ea);
                pudp = pud_alloc(&init_mm, pgdp, ea);
                if (!pudp)
                        return -ENOMEM;
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
                ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
        } else {
#ifdef CONFIG_PPC_MMU_NOHASH
                /* Warning! This will blow up if bootmem is not initialized,
                 * which our ppc64 code is keen to do; we'll need to fix it
                 * and/or be more careful.
                 */
                pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
                if (pgd_none(*pgdp)) {
                        pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
                        BUG_ON(pudp == NULL);
                        pgd_populate(&init_mm, pgdp, pudp);
                }
#endif /* PUD_TABLE_SIZE */
                pudp = pud_offset(pgdp, ea);
                if (pud_none(*pudp)) {
                        pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
                        BUG_ON(pmdp == NULL);
                        pud_populate(&init_mm, pudp, pmdp);
                }
                pmdp = pmd_offset(pudp, ea);
                if (!pmd_present(*pmdp)) {
                        ptep = early_alloc_pgtable(PAGE_SIZE);
                        BUG_ON(ptep == NULL);
                        pmd_populate_kernel(&init_mm, pmdp, ptep);
                }
                ptep = pte_offset_kernel(pmdp, ea);
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
                /*
                 * If the mm subsystem is not fully up, we cannot create a
                 * linux page table entry for this mapping.  Simply bolt an
                 * entry in the hardware page table.
                 */
                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
                                      mmu_io_psize, mmu_kernel_ssize)) {
                        printk(KERN_ERR "Failed to do bolted mapping IO "
                               "memory at %016lx !\n", pa);
                        return -ENOMEM;
                }
#endif /* !CONFIG_PPC_MMU_NOHASH */
        }

#ifdef CONFIG_PPC_BOOK3E_64
        /*
         * With hardware tablewalk, a sync is needed to ensure that
         * subsequent accesses see the PTE we just wrote.  Unlike userspace
         * mappings, we can't tolerate spurious faults, so make sure
         * the new PTE will be seen the first time.
         */
        mb();
#else
        smp_wmb();
#endif
        return 0;
}

/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
                            unsigned long flags)
{
        unsigned long i;

        /* Make sure we have the base flags */
        if ((flags & _PAGE_PRESENT) == 0)
                flags |= pgprot_val(PAGE_KERNEL);

        /* Non-cacheable page cannot be coherent */
        if (flags & _PAGE_NO_CACHE)
                flags &= ~_PAGE_COHERENT;

        /* We don't support the 4K PFN hack with ioremap */
        if (flags & _PAGE_4K_PFN)
                return NULL;

        WARN_ON(pa & ~PAGE_MASK);
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        for (i = 0; i < size; i += PAGE_SIZE)
                if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
                        return NULL;

        return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        unmap_kernel_range((unsigned long)ea, size);
}

void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
                                unsigned long flags, void *caller)
{
        phys_addr_t paligned;
        void __iomem *ret;

        /*
         * Choose an address to map it to.  Once the vmalloc system is
         * running, we use it.  Before that, we map using addresses going
         * up from ioremap_bot.  vmalloc will use the addresses from
         * ioremap_bot through IOREMAP_END.
         */
        paligned = addr & PAGE_MASK;
        size = PAGE_ALIGN(addr + size) - paligned;

        if ((size == 0) || (paligned == 0))
                return NULL;

        if (mem_init_done) {
                struct vm_struct *area;

                area = __get_vm_area_caller(size, VM_IOREMAP,
                                            ioremap_bot, IOREMAP_END,
                                            caller);
                if (area == NULL)
                        return NULL;

                area->phys_addr = paligned;
                ret = __ioremap_at(paligned, area->addr, size, flags);
                if (!ret)
                        vunmap(area->addr);
        } else {
                ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
                if (ret)
                        ioremap_bot += size;
        }

        if (ret)
                ret += addr & ~PAGE_MASK;
        return ret;
}

void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
                         unsigned long flags)
{
        return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

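/*
 * ioremap - map bus memory into CPU space with cache-inhibited, guarded
 * attributes, the normal setting for memory-mapped I/O.  Platforms may
 * override this via ppc_md.ioremap.
 */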
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

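/*
 * ioremap_wc - like ioremap(), but cache-inhibited without the guarded
 * bit, intended to allow write combining.
 */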
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

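/*
 * ioremap_prot - like ioremap(), but with caller-supplied protection
 * flags, sanitized below so user and exec bits don't leak into a kernel
 * mapping.
 */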
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
                            unsigned long flags)
{
        void *caller = __builtin_return_address(0);

        /* writeable implies dirty for kernel addresses */
        if (flags & _PAGE_RW)
                flags |= _PAGE_DIRTY;

        /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
        flags &= ~(_PAGE_USER | _PAGE_EXEC);

#ifdef _PAGE_BAP_SR
        /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
         * which means that we just cleared supervisor access... oops ;-) This
         * restores it
         */
        flags |= _PAGE_BAP_SR;
#endif

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

/*
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
void __iounmap(volatile void __iomem *token)
{
        void *addr;

        if (!mem_init_done)
                return;

        addr = (void *) ((unsigned long __force)
                         PCI_FIX_ADDR(token) & PAGE_MASK);
        if ((unsigned long)addr < ioremap_bot) {
                printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
                       " at 0x%p\n", addr);
                return;
        }
        vunmap(addr);
}

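/*
 * iounmap - tear down a mapping established by ioremap().  Dispatches to
 * the platform hook if one is registered, otherwise uses __iounmap().
 */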
void iounmap(volatile void __iomem *token)
{
        if (ppc_md.iounmap)
                ppc_md.iounmap(token);
        else
                __iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);

/*
 * For a hugepage we have the pfn in the pmd and use the low PTE_RPN_SHIFT
 * bits for flags.
 * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(pmd))
                return pfn_to_page(pmd_pfn(pmd));
#endif
        return virt_to_page(pmd_page_vaddr(pmd));
}

#ifdef CONFIG_PPC_64K_PAGES
static pte_t *get_from_cache(struct mm_struct *mm)
{
        void *pte_frag, *ret;

        spin_lock(&mm->page_table_lock);
        ret = mm->context.pte_frag;
        if (ret) {
                pte_frag = ret + PTE_FRAG_SIZE;
                /*
                 * If we have taken up all the fragments, mark the PTE page NULL
                 */
                if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
                        pte_frag = NULL;
                mm->context.pte_frag = pte_frag;
        }
        spin_unlock(&mm->page_table_lock);
        return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
        void *ret = NULL;
        struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
                                       __GFP_REPEAT | __GFP_ZERO);
        if (!page)
                return NULL;
        if (!kernel && !pgtable_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }

        ret = page_address(page);
        spin_lock(&mm->page_table_lock);
        /*
         * If we find pte_frag already set, we return
         * the allocated page with a single fragment
         * count.
         */
        if (likely(!mm->context.pte_frag)) {
                atomic_set(&page->_count, PTE_FRAG_NR);
                mm->context.pte_frag = ret + PTE_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);

        return (pte_t *)ret;
}

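/*
 * Hand out a 4K PTE-page fragment: reuse one from the per-mm cache when
 * available, otherwise allocate a fresh page and carve it into
 * PTE_FRAG_NR fragments.
 */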
pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
        pte_t *pte;

        pte = get_from_cache(mm);
        if (pte)
                return pte;

        return __alloc_for_cache(mm, kernel);
}

void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
        struct page *page = virt_to_page(table);
        if (put_page_testzero(page)) {
                if (!kernel)
                        pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
        struct page *page = virt_to_page(table);
        if (put_page_testzero(page)) {
                pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

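/*
 * Queue a page table for freeing after the TLB flush.  The table's level
 * (shift) is encoded in the low bits of the pointer so that
 * __tlb_remove_table() can tell PTE fragments from higher-level tables.
 */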
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        unsigned long pgf = (unsigned long)table;

        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
        pgf |= shift;
        tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
        unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

        if (!shift)
                /* PTE page needs special handling */
                page_table_free_rcu(table);
        else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        if (!shift) {
                /* PTE page needs special handling */
                struct page *page = virt_to_page(table);
                if (put_page_testzero(page)) {
                        pgtable_page_dtor(page);
                        free_hot_cold_page(page, 0);
                }
        } else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in
 * the page fault path when we don't hit any of the major fault cases,
 * i.e. a minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic
 * code will have handled those two for us; we additionally deal with
 * missing execute permission here on some processors.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp, pmd_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
                /*
                 * Since we are not supporting SW TLB systems, we don't
                 * have anything similar to flush_tlb_page_nohash()
                 */
        }
        return changed;
}

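/*
 * Atomically clear the 'clr' bits and set the 'set' bits in a huge pmd,
 * flushing the hash table entries if the old pmd had _PAGE_HASHPTE.
 * Returns the old pmd value.
 */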
unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                  pmd_t *pmdp, unsigned long clr,
                                  unsigned long set)
{
        unsigned long old, tmp;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                or      %1,%1,%7\n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd((old & ~clr) | set);
#endif
        if (old & _PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
}

pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (pmd_trans_huge(*pmdp)) {
                pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
        } else {
                /*
                 * khugepaged calls this for normal pmd
                 */
                pmd = *pmdp;
                pmd_clear(pmdp);
                /*
                 * Wait for all pending hash_page to finish. This is needed
                 * in case of subpage collapse.  When we collapse normal pages
                 * to hugepage, we first clear the pmd, then invalidate all
                 * the PTE entries. The assumption here is that any low level
                 * page fault will see a none pmd and take the slow path that
                 * will wait on mmap_sem. But we could very well be in a
                 * hash_page with local ptep pointer value. Such a hash page
                 * can result in adding new HPTE entries for normal subpages.
                 * That means we could be modifying the page content as we
                 * copy them to a huge page. So wait for parallel hash_page
                 * to finish before invalidating HPTE entries. We can do this
                 * by sending an IPI to all the cpus and executing a dummy
                 * function there.
                 */
                kick_all_cpus_sync();
                /*
                 * Now invalidate the hpte entries in the range
                 * covered by pmd. This makes sure we take a
                 * fault and will find the pmd as none, which will
                 * result in a major fault which takes mmap_sem and
                 * hence wait for collapse to complete. Without this
                 * the __collapse_huge_page_copy can result in copying
                 * the old content.
                 */
                flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
        }
        return pmd;
}

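/*
 * Test and clear the accessed bit of a huge pmd via
 * __pmdp_test_and_clear_young().
 */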
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We currently remove entries from the hashtable regardless of whether
 * the entry was young or dirty. The generic routines only flush if the
 * entry was young or dirty which is not good enough.
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We mark the pmd splitting and invalidate all the hpte
 * entries for this hugepage.
 */
void pmdp_splitting_flush(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp)
{
        unsigned long old, tmp;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES

        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                ori     %1,%0,%4 \n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
        /*
         * If we didn't have the splitting flag set, go and flush the
         * HPTE entries.
         */
        if (!(old & _PAGE_SPLITTING)) {
                /* We need to flush the hpte */
                if (old & _PAGE_HASHPTE)
                        hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
        }
        /*
         * This ensures that generic code that relies on IRQ disabling
         * to prevent a parallel THP split works as expected.
         */
        kick_all_cpus_sync();
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;
        assert_spin_locked(&mm->page_table_lock);
        /*
         * we store the pgtable in the second half of PMD
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * Expose the deposited pgtable to other cpus before we set
         * the hugepage PTE at pmd level; the hash fault code looks at
         * the deposited pgtable to store hash index values.
         */
        smp_wmb();
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment.
         * Zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}

/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
        WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
        assert_spin_locked(&mm->page_table_lock);
        WARN_ON(!pmd_trans_huge(pmd));
#endif
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

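/*
 * Clear _PAGE_PRESENT on a huge pmd; pmd_hugepage_update() takes care of
 * flushing the corresponding hash table entries if needed.
 */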
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp, unsigned long old_pmd)
{
        int ssize, i;
        unsigned long s_addr;
        int max_hpte_count;
        unsigned int psize, valid;
        unsigned char *hpte_slot_array;
        unsigned long hidx, vpn, vsid, hash, shift, slot;

        /*
         * Flush all the hptes mapping this hugepage
         */
        s_addr = addr & HPAGE_PMD_MASK;
        hpte_slot_array = get_hpte_slot_array(pmdp);
        /*
         * If we try to do a HUGE PTE update after a withdraw is done,
         * we will find the below NULL. This happens when we do
         * split_huge_page_pmd
         */
        if (!hpte_slot_array)
                return;

        /* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, s_addr);
        BUG_ON(psize == MMU_PAGE_16M);
#endif
        if (old_pmd & _PAGE_COMBO)
                psize = MMU_PAGE_4K;
        else
                psize = MMU_PAGE_64K;

        if (!is_kernel_addr(s_addr)) {
                ssize = user_segment_size(s_addr);
                vsid = get_vsid(mm->context.id, s_addr, ssize);
                WARN_ON(vsid == 0);
        } else {
                vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
                ssize = mmu_kernel_ssize;
        }

        if (ppc_md.hugepage_invalidate)
                return ppc_md.hugepage_invalidate(vsid, s_addr,
                                                  hpte_slot_array,
                                                  psize, ssize);
        /*
         * No bulk hpte removal support, invalidate each entry
         */
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = HPAGE_PMD_SIZE >> shift;
        for (i = 0; i < max_hpte_count; i++) {
                /*
                 * 8 bits per hpte entry:
                 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
                 */
                valid = hpte_valid(hpte_slot_array, i);
                if (!valid)
                        continue;
                hidx = hpte_hash_index(hpte_slot_array, i);

                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
                vpn = hpt_vpn(addr, vsid, ssize);
                hash = hpt_hash(vpn, shift, ssize);
                if (hidx & _PTEIDX_SECONDARY)
                        hash = ~hash;

                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += hidx & _PTEIDX_GROUP_IX;
                ppc_md.hpte_invalidate(slot, vpn, psize,
                                       MMU_PAGE_16M, ssize, 0);
        }
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
        pmd_val(pmd) |= pgprot_val(pgprot);
        return pmd;
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
        pmd_t pmd;
        /*
         * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
         * set. We use this to check for a THP page at pmd level.
         * A leaf pte for a huge page has the bottom two bits != 00.
         */
        pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
        pmd_val(pmd) |= _PAGE_THP_HUGE;
        pmd = pmd_set_protbits(pmd, pgprot);
        return pmd;
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
        return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        pmd_val(pmd) &= _HPAGE_CHG_MASK;
        pmd = pmd_set_protbits(pmd, newprot);
        return pmd;
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd)
{
        return;
}

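/*
 * Atomically clear the whole huge pmd (flushing hash entries via
 * pmd_hugepage_update()) and wipe the hash-index information stored in
 * the deposited PTE fragment.
 */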
pmd_t pmdp_get_and_clear(struct mm_struct *mm,
                         unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Zero out the old valid and hash index details;
         * the hash fault code looks at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return old_pmd;
}

int has_transparent_hugepage(void)
{
        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a segment
         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
         * of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */