/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

	/*
	 * Match the virtual addresses, permissions and the alignment of
	 * the page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}
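
/*
 * Worked example (illustrative numbers, not from the source): with 4KB
 * base pages, if svma->vm_pgoff = 0 and svma->vm_start = 0x40000000,
 * then for idx = 0x300:
 *
 *	saddr = (0x300 << 12) + 0x40000000 = 0x40300000
 *	sbase = 0x40300000 & PUD_MASK      = 0x40000000  (1GB aligned)
 *	s_end = 0x40000000 + PUD_SIZE      = 0x80000000
 *
 * Sharing is refused unless svma covers [sbase, s_end) entirely.
 */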

static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/*
	 * check on proper vm_flags and page table alignment
	 */
	if (vma->vm_flags & VM_MAYSHARE &&
	    vma->vm_start <= base && end <= vma->vm_end)
		return 1;
	return 0;
}

/*
 * Search for a shareable pmd page for hugetlb.
 */
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct prio_tree_iter iter;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;

	if (!vma_shareable(vma, addr))
		return;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
	else
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	mutex_unlock(&mapping->i_mmap_mutex);
}
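
/*
 * Illustrative user-space sketch (not part of this file): pmd sharing can
 * kick in when two MAP_SHARED mappings of the same hugetlbfs file each
 * cover a full, PUD_SIZE-aligned (1GB with 4KB base pages) region with
 * matching permissions. The mount point, sizes and address hints below
 * are assumptions for the example, not requirements of the API.
 */
#if 0	/* user-space illustration only */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define GB	(1UL << 30)

int main(void)
{
	/* assumes a hugetlbfs mount at /mnt/huge with enough 2MB pages */
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
	char *a, *b;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, GB) < 0)
		return 1;

	/* 1GB-aligned hints so each vma spans a whole PUD region */
	a = mmap((void *)(4 * GB), GB, PROT_READ | PROT_WRITE,
		 MAP_SHARED, fd, 0);
	b = mmap((void *)(8 * GB), GB, PROT_READ | PROT_WRITE,
		 MAP_SHARED, fd, 0);
	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;

	a[0] = 1;	/* fault in; the second mapping can now share */
	printf("a=%p b=%p b[0]=%d\n", a, b, b[0]);
	return 0;
}
#endif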

/*
 * Unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
 * (indicated by page_count > 1), unmap is achieved by clearing the pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * Called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	pud_clear(pud);
	put_page(virt_to_page(ptep));
	/*
	 * Advance *addr to just below the next PUD boundary so the
	 * caller's loop increment skips the rest of the shared range.
	 */
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}
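
/*
 * Illustrative caller pattern (a hypothetical sketch loosely modeled on
 * the hugetlb unmap path in mm/hugetlb.c; not a function in this file):
 */
#if 0	/* illustration only */
static void example_unmap_hugepage_range(struct vm_area_struct *vma,
					 unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		/*
		 * On success the shared pmd page is unhooked from this mm
		 * and address has been rewound, so the loop increment
		 * lands on the next PUD boundary.
		 */
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		/* ... clear the pte and release the huge page here ... */
	}
	spin_unlock(&mm->page_table_lock);
}
#endif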

pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (pud_none(*pud))
				huge_pmd_share(mm, addr, pud);
			pte = (pte_t *) pmd_alloc(mm, pud, addr);
		}
	}
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}
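
/*
 * Note: for huge mappings the pte_t * returned above actually points at
 * a pud entry (sz == PUD_SIZE) or a pmd entry (sz == PMD_SIZE); the
 * generic hugetlb code treats both uniformly as huge ptes.
 */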

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud)) {
			if (pud_large(*pud))
				return (pte_t *)pud;
			pmd = pmd_offset(pud, addr);
		}
	}
	return (pte_t *) pmd;
}

#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	unsigned long vpfn = address >> PAGE_SHIFT;
	struct page *page;
	struct vm_area_struct *vma;
	pte_t *pte;

	vma = find_vma(mm, address);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

	WARN_ON(!PageHead(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

/* Huge pmd/pud entries are marked with the PSE (page size) bit. */
int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
	return !!(pud_val(pud) & _PAGE_PSE);
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
	return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
		pud_t *pud, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pud);
	if (page)
		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
	return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > mm->cached_hole_size) {
		start_addr = mm->free_area_cache;
	} else {
		start_addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	addr = ALIGN(start_addr, huge_page_size(h));

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point: (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = TASK_UNMAPPED_BASE;
				mm->cached_hole_size = 0;
				goto full_search;
			}
			return -ENOMEM;
		}
		if (!vma || addr + len <= vma->vm_start) {
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = ALIGN(vma->vm_end, huge_page_size(h));
	}
}

static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev_vma;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & huge_page_mask(h);
	do {
		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr + len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = (vma->vm_start - len) & huge_page_mask(h);
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
			len, pgoff, flags);

	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}
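
/*
 * Example (illustrative, assuming 2MB huge pages): a request with
 * len = 5MB fails the (len & ~huge_page_mask(h)) check above and mmap()
 * returns EINVAL, while len = 6MB is accepted and the returned address
 * is 2MB aligned.
 */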

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
	unsigned long ps = memparse(opt, &opt);
	if (ps == PMD_SIZE) {
		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
	} else {
		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
			ps >> 20);
		return 0;
	}
	return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
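
/*
 * Example boot parameters (1G pages additionally require gbpages
 * support, the "pdpe1gb" flag in /proc/cpuinfo):
 *
 *	hugepagesz=2M hugepages=512
 *	hugepagesz=1G hugepages=4
 */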
#endif