Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * IA-32 Huge TLB Page Support for Kernel. | |
3 | * | |
4 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | |
5 | */ | |
6 | ||
1da177e4 LT |
7 | #include <linux/init.h> |
8 | #include <linux/fs.h> | |
9 | #include <linux/mm.h> | |
10 | #include <linux/hugetlb.h> | |
11 | #include <linux/pagemap.h> | |
1da177e4 LT |
12 | #include <linux/err.h> |
13 | #include <linux/sysctl.h> | |
14 | #include <asm/mman.h> | |
15 | #include <asm/tlb.h> | |
16 | #include <asm/tlbflush.h> | |
a5a19c63 | 17 | #include <asm/pgalloc.h> |
1da177e4 | 18 | |
39dde65c CK |
19 | static unsigned long page_table_shareable(struct vm_area_struct *svma, |
20 | struct vm_area_struct *vma, | |
21 | unsigned long addr, pgoff_t idx) | |
22 | { | |
23 | unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + | |
24 | svma->vm_start; | |
25 | unsigned long sbase = saddr & PUD_MASK; | |
26 | unsigned long s_end = sbase + PUD_SIZE; | |
27 | ||
32b154c0 MG |
28 | /* Allow segments to share if only one is marked locked */ |
29 | unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; | |
30 | unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; | |
31 | ||
39dde65c CK |
32 | /* |
33 | * match the virtual addresses, permission and the alignment of the | |
34 | * page table page. | |
35 | */ | |
36 | if (pmd_index(addr) != pmd_index(saddr) || | |
32b154c0 | 37 | vm_flags != svm_flags || |
39dde65c CK |
38 | sbase < svma->vm_start || svma->vm_end < s_end) |
39 | return 0; | |
40 | ||
41 | return saddr; | |
42 | } | |
43 | ||
44 | static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | |
45 | { | |
46 | unsigned long base = addr & PUD_MASK; | |
47 | unsigned long end = base + PUD_SIZE; | |
48 | ||
49 | /* | |
50 | * check on proper vm_flags and page table alignment | |
51 | */ | |
52 | if (vma->vm_flags & VM_MAYSHARE && | |
53 | vma->vm_start <= base && end <= vma->vm_end) | |
54 | return 1; | |
55 | return 0; | |
56 | } | |
57 | ||
58 | /* | |
eb48c071 MH |
59 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() |
60 | * and returns the corresponding pte. While this is not necessary for the | |
61 | * !shared pmd case because we can allocate the pmd later as well, it makes the | |
62 | * code much cleaner. pmd allocation is essential for the shared case because | |
63 | * pud has to be populated inside the same i_mmap_mutex section - otherwise | |
64 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a | |
65 | * bad pmd for sharing. | |
39dde65c | 66 | */ |
eb48c071 MH |
67 | static pte_t * |
68 | huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |
39dde65c CK |
69 | { |
70 | struct vm_area_struct *vma = find_vma(mm, addr); | |
71 | struct address_space *mapping = vma->vm_file->f_mapping; | |
72 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + | |
73 | vma->vm_pgoff; | |
39dde65c CK |
74 | struct vm_area_struct *svma; |
75 | unsigned long saddr; | |
76 | pte_t *spte = NULL; | |
eb48c071 | 77 | pte_t *pte; |
39dde65c CK |
78 | |
79 | if (!vma_shareable(vma, addr)) | |
eb48c071 | 80 | return (pte_t *)pmd_alloc(mm, pud, addr); |
39dde65c | 81 | |
3d48ae45 | 82 | mutex_lock(&mapping->i_mmap_mutex); |
6b2dbba8 | 83 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { |
39dde65c CK |
84 | if (svma == vma) |
85 | continue; | |
86 | ||
87 | saddr = page_table_shareable(svma, vma, addr, idx); | |
88 | if (saddr) { | |
89 | spte = huge_pte_offset(svma->vm_mm, saddr); | |
90 | if (spte) { | |
91 | get_page(virt_to_page(spte)); | |
92 | break; | |
93 | } | |
94 | } | |
95 | } | |
96 | ||
97 | if (!spte) | |
98 | goto out; | |
99 | ||
100 | spin_lock(&mm->page_table_lock); | |
101 | if (pud_none(*pud)) | |
a5a19c63 | 102 | pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
39dde65c CK |
103 | else |
104 | put_page(virt_to_page(spte)); | |
105 | spin_unlock(&mm->page_table_lock); | |
106 | out: | |
eb48c071 | 107 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
3d48ae45 | 108 | mutex_unlock(&mapping->i_mmap_mutex); |
eb48c071 | 109 | return pte; |
39dde65c CK |
110 | } |
111 | ||
112 | /* | |
113 | * unmap huge page backed by shared pte. | |
114 | * | |
115 | * Hugetlb pte page is ref counted at the time of mapping. If pte is shared | |
116 | * indicated by page_count > 1, unmap is achieved by clearing pud and | |
117 | * decrementing the ref count. If count == 1, the pte page is not shared. | |
118 | * | |
119 | * called with vma->vm_mm->page_table_lock held. | |
120 | * | |
121 | * returns: 1 successfully unmapped a shared pte page | |
122 | * 0 the underlying pte page is not shared, or it is the last user | |
123 | */ | |
124 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |
125 | { | |
126 | pgd_t *pgd = pgd_offset(mm, *addr); | |
127 | pud_t *pud = pud_offset(pgd, *addr); | |
128 | ||
129 | BUG_ON(page_count(virt_to_page(ptep)) == 0); | |
130 | if (page_count(virt_to_page(ptep)) == 1) | |
131 | return 0; | |
132 | ||
133 | pud_clear(pud); | |
134 | put_page(virt_to_page(ptep)); | |
135 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | |
136 | return 1; | |
137 | } | |
138 | ||
a5516438 AK |
139 | pte_t *huge_pte_alloc(struct mm_struct *mm, |
140 | unsigned long addr, unsigned long sz) | |
1da177e4 LT |
141 | { |
142 | pgd_t *pgd; | |
143 | pud_t *pud; | |
7bf07f3d | 144 | pte_t *pte = NULL; |
1da177e4 LT |
145 | |
146 | pgd = pgd_offset(mm, addr); | |
147 | pud = pud_alloc(mm, pgd, addr); | |
39dde65c | 148 | if (pud) { |
39c11e6c AK |
149 | if (sz == PUD_SIZE) { |
150 | pte = (pte_t *)pud; | |
151 | } else { | |
152 | BUG_ON(sz != PMD_SIZE); | |
153 | if (pud_none(*pud)) | |
eb48c071 MH |
154 | pte = huge_pmd_share(mm, addr, pud); |
155 | else | |
156 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | |
39c11e6c | 157 | } |
39dde65c | 158 | } |
0e5c9f39 | 159 | BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); |
7bf07f3d | 160 | |
7bf07f3d | 161 | return pte; |
1da177e4 LT |
162 | } |
163 | ||
63551ae0 | 164 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) |
1da177e4 LT |
165 | { |
166 | pgd_t *pgd; | |
167 | pud_t *pud; | |
168 | pmd_t *pmd = NULL; | |
169 | ||
170 | pgd = pgd_offset(mm, addr); | |
02b0ccef AL |
171 | if (pgd_present(*pgd)) { |
172 | pud = pud_offset(pgd, addr); | |
39c11e6c AK |
173 | if (pud_present(*pud)) { |
174 | if (pud_large(*pud)) | |
175 | return (pte_t *)pud; | |
02b0ccef | 176 | pmd = pmd_offset(pud, addr); |
39c11e6c | 177 | } |
02b0ccef | 178 | } |
1da177e4 LT |
179 | return (pte_t *) pmd; |
180 | } | |
181 | ||
1da177e4 LT |
182 | #if 0 /* This is just for testing */ |
183 | struct page * | |
184 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |
185 | { | |
186 | unsigned long start = address; | |
187 | int length = 1; | |
188 | int nr; | |
189 | struct page *page; | |
190 | struct vm_area_struct *vma; | |
191 | ||
192 | vma = find_vma(mm, addr); | |
193 | if (!vma || !is_vm_hugetlb_page(vma)) | |
194 | return ERR_PTR(-EINVAL); | |
195 | ||
196 | pte = huge_pte_offset(mm, address); | |
197 | ||
198 | /* hugetlb should be locked, and hence, prefaulted */ | |
199 | WARN_ON(!pte || pte_none(*pte)); | |
200 | ||
201 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | |
202 | ||
25e59881 | 203 | WARN_ON(!PageHead(page)); |
1da177e4 LT |
204 | |
205 | return page; | |
206 | } | |
207 | ||
208 | int pmd_huge(pmd_t pmd) | |
209 | { | |
210 | return 0; | |
211 | } | |
212 | ||
ceb86879 AK |
213 | int pud_huge(pud_t pud) |
214 | { | |
215 | return 0; | |
216 | } | |
217 | ||
1da177e4 LT |
218 | struct page * |
219 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |
220 | pmd_t *pmd, int write) | |
221 | { | |
222 | return NULL; | |
223 | } | |
224 | ||
225 | #else | |
226 | ||
227 | struct page * | |
228 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |
229 | { | |
230 | return ERR_PTR(-EINVAL); | |
231 | } | |
232 | ||
233 | int pmd_huge(pmd_t pmd) | |
234 | { | |
235 | return !!(pmd_val(pmd) & _PAGE_PSE); | |
236 | } | |
237 | ||
ceb86879 AK |
238 | int pud_huge(pud_t pud) |
239 | { | |
39c11e6c | 240 | return !!(pud_val(pud) & _PAGE_PSE); |
ceb86879 AK |
241 | } |
242 | ||
1da177e4 LT |
243 | struct page * |
244 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |
245 | pmd_t *pmd, int write) | |
246 | { | |
247 | struct page *page; | |
248 | ||
249 | page = pte_page(*(pte_t *)pmd); | |
250 | if (page) | |
ceb86879 | 251 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); |
1da177e4 LT |
252 | return page; |
253 | } | |
ceb86879 AK |
254 | |
255 | struct page * | |
256 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | |
257 | pud_t *pud, int write) | |
258 | { | |
259 | struct page *page; | |
260 | ||
261 | page = pte_page(*(pte_t *)pud); | |
262 | if (page) | |
263 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | |
264 | return page; | |
265 | } | |
266 | ||
1da177e4 LT |
267 | #endif |
268 | ||
1da177e4 LT |
269 | /* x86_64 also uses this file */ |
270 | ||
271 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | |
272 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | |
273 | unsigned long addr, unsigned long len, | |
274 | unsigned long pgoff, unsigned long flags) | |
275 | { | |
39c11e6c | 276 | struct hstate *h = hstate_file(file); |
cdc17344 ML |
277 | struct vm_unmapped_area_info info; |
278 | ||
279 | info.flags = 0; | |
280 | info.length = len; | |
281 | info.low_limit = TASK_UNMAPPED_BASE; | |
282 | info.high_limit = TASK_SIZE; | |
283 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | |
284 | info.align_offset = 0; | |
285 | return vm_unmapped_area(&info); | |
1da177e4 LT |
286 | } |
287 | ||
288 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |
289 | unsigned long addr0, unsigned long len, | |
290 | unsigned long pgoff, unsigned long flags) | |
291 | { | |
39c11e6c | 292 | struct hstate *h = hstate_file(file); |
cdc17344 ML |
293 | struct vm_unmapped_area_info info; |
294 | unsigned long addr; | |
1da177e4 | 295 | |
cdc17344 ML |
296 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
297 | info.length = len; | |
298 | info.low_limit = PAGE_SIZE; | |
299 | info.high_limit = current->mm->mmap_base; | |
300 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | |
301 | info.align_offset = 0; | |
302 | addr = vm_unmapped_area(&info); | |
1da177e4 | 303 | |
1da177e4 LT |
304 | /* |
305 | * A failed mmap() very likely causes application failure, | |
306 | * so fall back to the bottom-up function here. This scenario | |
307 | * can happen with large stack limits and large mmap() | |
308 | * allocations. | |
309 | */ | |
cdc17344 ML |
310 | if (addr & ~PAGE_MASK) { |
311 | VM_BUG_ON(addr != -ENOMEM); | |
312 | info.flags = 0; | |
313 | info.low_limit = TASK_UNMAPPED_BASE; | |
314 | info.high_limit = TASK_SIZE; | |
315 | addr = vm_unmapped_area(&info); | |
316 | } | |
1da177e4 LT |
317 | |
318 | return addr; | |
319 | } | |
320 | ||
321 | unsigned long | |
322 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |
323 | unsigned long len, unsigned long pgoff, unsigned long flags) | |
324 | { | |
39c11e6c | 325 | struct hstate *h = hstate_file(file); |
1da177e4 LT |
326 | struct mm_struct *mm = current->mm; |
327 | struct vm_area_struct *vma; | |
328 | ||
39c11e6c | 329 | if (len & ~huge_page_mask(h)) |
1da177e4 LT |
330 | return -EINVAL; |
331 | if (len > TASK_SIZE) | |
332 | return -ENOMEM; | |
333 | ||
5a8130f2 | 334 | if (flags & MAP_FIXED) { |
a5516438 | 335 | if (prepare_hugepage_range(file, addr, len)) |
5a8130f2 BH |
336 | return -EINVAL; |
337 | return addr; | |
338 | } | |
339 | ||
1da177e4 | 340 | if (addr) { |
39c11e6c | 341 | addr = ALIGN(addr, huge_page_size(h)); |
1da177e4 LT |
342 | vma = find_vma(mm, addr); |
343 | if (TASK_SIZE - len >= addr && | |
344 | (!vma || addr + len <= vma->vm_start)) | |
345 | return addr; | |
346 | } | |
347 | if (mm->get_unmapped_area == arch_get_unmapped_area) | |
348 | return hugetlb_get_unmapped_area_bottomup(file, addr, len, | |
349 | pgoff, flags); | |
350 | else | |
351 | return hugetlb_get_unmapped_area_topdown(file, addr, len, | |
352 | pgoff, flags); | |
353 | } | |
354 | ||
355 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ | |
356 | ||
b4718e62 AK |
357 | #ifdef CONFIG_X86_64 |
358 | static __init int setup_hugepagesz(char *opt) | |
359 | { | |
360 | unsigned long ps = memparse(opt, &opt); | |
361 | if (ps == PMD_SIZE) { | |
362 | hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); | |
363 | } else if (ps == PUD_SIZE && cpu_has_gbpages) { | |
364 | hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); | |
365 | } else { | |
366 | printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", | |
367 | ps >> 20); | |
368 | return 0; | |
369 | } | |
370 | return 1; | |
371 | } | |
372 | __setup("hugepagesz=", setup_hugepagesz); | |
373 | #endif |