/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))

#define HUGEPTE_INDEX_SIZE	9
#define HUGEPGD_INDEX_SIZE	10

#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)

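/*
 * Huge pages live in a separate two-level table: a per-mm huge page
 * directory of PTRS_PER_HUGEPGD puds, each of which points to a page
 * of PTRS_PER_HUGEPTE hugepte entries.  A PAGE_SIZE pte page holds
 * 2^(PAGE_SHIFT - 3) eight-byte entries, each mapping one huge page,
 * hence HUGEPGDIR_SHIFT = HPAGE_SHIFT + PAGE_SHIFT - 3.  With the 4K
 * base pages and 16M huge pages that the hardcoded index sizes above
 * assume, that is 512 entries per pte page, so each directory slot
 * spans 2^33 bytes (8G) of address space.
 */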
static inline int hugepgd_index(unsigned long addr)
{
	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
}

static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
{
	int index;

	if (! mm->context.huge_pgdir)
		return NULL;

	index = hugepgd_index(addr);
	BUG_ON(index >= PTRS_PER_HUGEPGD);
	return (pud_t *)(mm->context.huge_pgdir + index);
}

static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
{
	int index;

	if (pud_none(*dir))
		return NULL;

	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
	return (pte_t *)pud_page(*dir) + index;
}

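/*
 * The allocators below are called with mm->page_table_lock held.
 * They drop it around the (possibly sleeping) kmem_cache_alloc()
 * and must therefore re-check the entry after retaking it, since
 * another thread may have populated it while the lock was released.
 */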
static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
{
	BUG_ON(! in_hugepage_area(mm->context, addr));

	if (! mm->context.huge_pgdir) {
		pgd_t *new;
		spin_unlock(&mm->page_table_lock);
		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (mm->context.huge_pgdir)
			pgd_free(new);
		else
			mm->context.huge_pgdir = new;
	}
	return hugepgd_offset(mm, addr);
}

static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
{
	if (! pud_present(*dir)) {
		pte_t *new;

		spin_unlock(&mm->page_table_lock);
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);
		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (pud_present(*dir)) {
			if (new)
				kmem_cache_free(zero_cache, new);
		} else {
			struct page *ptepage;

			if (! new)
				return NULL;
			ptepage = virt_to_page(new);
			ptepage->mapping = (void *) mm;
			ptepage->index = addr & HUGEPGDIR_MASK;
			pud_populate(mm, dir, new);
		}
	}

	return hugepte_offset(dir, addr);
}

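/*
 * huge_pte_offset() and huge_pte_alloc() are the lookup and allocate
 * entry points used by the generic hugetlb code; both walk the
 * separate hugepage tables via the helpers above.
 */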
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pud = hugepgd_offset(mm, addr);
	if (! pud)
		return NULL;

	return hugepte_offset(pud, addr);
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pud = hugepgd_alloc(mm, addr);
	if (! pud)
		return NULL;

	return hugepte_alloc(mm, pud, addr);
}

/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;
	if (! (within_hugepage_low_range(addr, len)
	       || within_hugepage_high_range(addr, len)))
		return -EINVAL;
	return 0;
}

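/*
 * Invalidate the SLB entries for the low-address segments selected in
 * the 16-bit mask passed in parm (cast through void * so this can run
 * on every CPU via on_each_cpu()).  Stale SLB entries must be evicted
 * before a segment switches between normal and hugepage mappings.
 */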
static void flush_segments(void *parm)
{
	u16 segs = (unsigned long) parm;
	unsigned long i;

	asm volatile("isync" : : : "memory");

	for (i = 0; i < 16; i++) {
		if (! (segs & (1U << i)))
			continue;
		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
	}

	asm volatile("isync" : : : "memory");
}

static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
	unsigned long start = seg << SID_SHIFT;
	unsigned long end = (seg+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(seg >= 16);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

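/*
 * Convert the low segments selected in newsegs into hugepage segments:
 * fail with -EBUSY if any of them already contain normal-page VMAs,
 * otherwise mark them in the context and flush stale SLB entries on
 * all CPUs.
 */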
static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
{
	unsigned long i;

	newsegs &= ~(mm->context.htlb_segs);
	if (! newsegs)
		return 0; /* The segments we want are already open */

	for (i = 0; i < 16; i++)
		if ((1 << i) & newsegs)
			if (prepare_low_seg_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.htlb_segs |= newsegs;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();
	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);

	return 0;
}

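/*
 * Called before a hugepage mapping is created: addresses within the
 * dedicated high hugepage range need no preparation, while ranges
 * below 4GB require the corresponding low segments to be opened for
 * hugepages first.
 */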
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	if (within_hugepage_high_range(addr, len))
		return 0;
	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
		int err;
		/* Yes, we need both tests, in case addr+len overflows
		 * 64-bit arithmetic */
		err = open_low_hpage_segs(current->mm,
					  LOW_ESID_MASK(addr, len));
		if (err)
			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
			       " failed (segs: 0x%04hx)\n", addr, len,
			       LOW_ESID_MASK(addr, len));
		return err;
	}

	return -EINVAL;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

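/*
 * On ppc64 hugepages live in their own separate tables rather than in
 * pmd entries, so pmd_huge() is always false and follow_huge_pmd()
 * should never be reached; generic code goes through follow_huge_addr()
 * above instead.
 */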
int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	start_addr = addr = mm->free_area_cache;

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_END;
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_BASE - len;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end)))
			/* remember the address as a hint for next time */
			return (mm->free_area_cache = addr);
		else
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end)
				mm->free_area_cache = vma->vm_start;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;

	return addr;
}

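/*
 * Find a free range of len bytes below 4GB that lies entirely within
 * the low segments selected by segmask.
 */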
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len)
{
	unsigned long addr = TASK_HPAGE_BASE;
	struct vm_area_struct *vma;

	for (vma = find_vma(current->mm, addr);
	     addr + len <= TASK_HPAGE_END;
	     vma = vma->vm_next) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
		BUG_ON(! within_hugepage_high_range(addr, len));

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Because we're in a hugepage region, this alignment
		 * should not skip us over any VMAs */
	}

	return -ENOMEM;
}

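/*
 * Pick an address range for a new hugepage mapping: 32-bit tasks
 * search the low segments (opening additional ones on demand), while
 * 64-bit tasks use the dedicated high hugepage area.
 */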
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	if (test_thread_flag(TIF_32BIT)) {
		int lastshift = 0;
		u16 segmask, cursegs = current->mm->context.htlb_segs;

		/* First see if we can do the mapping in the existing
		 * low hpage segments */
		addr = htlb_get_low_area(len, cursegs);
		if (addr != -ENOMEM)
			return addr;

		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; segmask >>= 1) {
			if (segmask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, cursegs | segmask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_segs(current->mm, segmask) == 0)
				return addr;
		}
		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
		       " enough segments\n");
		return -ENOMEM;
	} else {
		return htlb_get_high_area(len);
	}
}

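/*
 * Tear down an mm's huge page directory, freeing any remaining
 * hugepte pages and then the directory page itself.
 */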
void hugetlb_mm_free_pgd(struct mm_struct *mm)
{
	int i;
	pgd_t *pgdir;

	spin_lock(&mm->page_table_lock);

	pgdir = mm->context.huge_pgdir;
	if (! pgdir)
		goto out;

	mm->context.huge_pgdir = NULL;

	/* cleanup any hugepte pages leftover */
	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
		pud_t *pud = (pud_t *)(pgdir + i);

		if (! pud_none(*pud)) {
			pte_t *pte = (pte_t *)pud_page(*pud);
			struct page *ptepage = virt_to_page(pte);

			ptepage->mapping = NULL;

			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
			kmem_cache_free(zero_cache, pte);
		}
		pud_clear(pud);
	}

	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
	kmem_cache_free(zero_cache, pgdir);

out:
	spin_unlock(&mm->page_table_lock);
}

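/*
 * Low-level fault handler for hugepages, mirroring what hash_page()
 * does for normal pages: translate the Linux pte for the faulting
 * address into a hashed page table entry, inserting or updating an
 * HPTE as needed.  Returns 0 on success, 1 to send the fault up to
 * do_page_fault().
 */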
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local)
{
	pte_t *ptep;
	unsigned long va, vpn;
	pte_t old_pte, new_pte;
	unsigned long hpteflags, prpn;
	long slot;
	int err = 1;

	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);
	vpn = va >> HPAGE_SHIFT;

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/* BUG_ON(pte_bad(*ptep)); */

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	old_pte = *ptep;
	new_pte = old_pte;

	hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(vpn, 1);
		if (pte_val(old_pte) & _PAGE_SECONDARY)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;

		if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(vpn, 1);
		unsigned long hpte_group;

		prpn = pte_pfn(old_pte);

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* Update the linux pte with the HPTE slot */
		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
		pte_val(new_pte) |= _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		hpteflags |= _PAGE_COHERENT;

		slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
					  hpteflags, 0, 1);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			pte_val(new_pte) |= _PAGE_SECONDARY;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
						  1, hpteflags, 0, 1);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;

		/*
		 * No need to use ldarx/stdcx here because all who
		 * might be updating the pte will hold the
		 * page_table_lock
		 */
		*ptep = new_pte;
	}

	err = 0;

out:
	spin_unlock(&mm->page_table_lock);

	return err;
}