/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm)
		update_mm(mm, current);
	__tlb_flush_local();
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm)
		__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		update_mm(mm, current);
}
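
/*
 * Worked example (added for illustration, not part of the original file):
 * upgrading an mm from the default 2 GB limit to 8 PB walks the ladder
 * twice.  asce_limit 1UL << 31 (2 GB, segment table as top level) first
 * becomes 1UL << 42 (4 TB, region-3 table), then 1UL << 53 (8 PB,
 * region-2 table); each step allocates one new crst table and re-parents
 * the old top level below it via pgd_populate().  Downgrading reverses
 * the ladder one level per loop iteration.
 */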
#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INVALID)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
	return 1;
}
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}
/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);
/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);
/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
	__releases(&gmap->mm->page_table_lock)
	__acquires(&gmap->mm->page_table_lock)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}
/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INVALID;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > TASK_MAX_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = (from + off) | (_SEGMENT_ENTRY_INVALID |
					 _SEGMENT_ENTRY_PROTECT);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
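
/*
 * Illustrative sketch (added here for clarity, not part of the original
 * file): a typical consumer such as KVM would pair the gmap calls above
 * roughly as follows.  The function name and parameters are hypothetical.
 */
static int __maybe_unused gmap_usage_sketch(struct mm_struct *mm,
					    unsigned long from,
					    unsigned long to,
					    unsigned long len)
{
	struct gmap *gmap;
	int rc;

	gmap = gmap_alloc(mm);			/* new guest address space */
	if (!gmap)
		return -ENOMEM;
	rc = gmap_map_segment(gmap, from, to, len);
	if (rc) {
		gmap_free(gmap);
		return rc;
	}
	gmap_enable(gmap);			/* make it the primary space */
	/* ... run the guest ... */
	gmap_disable(gmap);
	gmap_unmap_segment(gmap, to, len);
	gmap_free(gmap);
	return 0;
}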
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}
/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}
static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
					     _SEGMENT_ENTRY_PROTECT);
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}
/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_PROTECT))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
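
/*
 * Illustrative sketch (not in the original file): code resolving a guest
 * address, e.g. for instruction emulation, would usually try the cheap
 * lookup first and fall back to faulting the mapping in.  The function
 * name is hypothetical.
 */
static unsigned long __maybe_unused gmap_resolve_sketch(struct gmap *gmap,
							unsigned long gaddr)
{
	unsigned long uaddr;

	uaddr = gmap_translate(gaddr, gmap);	 /* no side effects */
	if (IS_ERR_VALUE(uaddr))
		uaddr = gmap_fault(gaddr, gmap); /* connects page tables */
	return uaddr;
}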
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
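
/*
 * Illustrative sketch (not part of the original file): a hypervisor that
 * wants to be told when a guest page is invalidated would register a
 * callback once and then arm the range.  The names and the callback body
 * below are hypothetical.
 */
static void __maybe_unused sketch_notifier_call(struct gmap *gmap,
						unsigned long address)
{
	/* react to the invalidation, e.g. kick the affected vcpu */
}

static struct gmap_notifier sketch_notifier = {
	.notifier_call = sketch_notifier_call,
};

static int __maybe_unused gmap_notify_sketch(struct gmap *gmap,
					     unsigned long start,
					     unsigned long len)
{
	gmap_register_ipte_notifier(&sketch_notifier);
	return gmap_ipte_notify(gmap, start, len); /* arms PGSTE_IN_BIT */
}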
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}
static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
		    PAGE_SIZE/2);
	return table;
}
static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_HC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}
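
/*
 * Worked example (added for illustration): on 64 bit a 4K page holds two
 * 2K page table fragments, tracked in the low bits of page->_mapcount.
 * With FRAG_MASK 0x03, allocating the first fragment xors 0x01 in,
 * allocating the second xors 0x02; once (mask & FRAG_MASK) == FRAG_MASK
 * the page is full and leaves mm->context.pgtable_list.  The bits at
 * FRAG_MASK << 4 are the "pending RCU free" shadow bits set by
 * page_table_free_rcu() below.
 */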
/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}
void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}
static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm, addr);
		if (!new)
			return -ENOMEM;

		spin_lock(&mm->page_table_lock);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table);
			new = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}
static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				   struct mm_struct *mm, pgd_t *pgd,
				   unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}
static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}
/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
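
/*
 * Illustrative sketch (not in the original file): a KVM-style caller
 * would enable pgstes once per process before building a guest address
 * space; the function name is hypothetical.
 */
static int __maybe_unused sketch_prepare_vm(void)
{
	int rc;

	rc = s390_enable_sie();		/* idempotent, may need memory */
	if (rc)
		return rc;
	/* from here on page tables carry pgstes and gmaps can be built */
	return 0;
}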
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}
static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}
1254 pgtable_t
pgtable_trans_huge_withdraw(struct mm_struct
*mm
, pmd_t
*pmdp
)
1256 struct list_head
*lh
;
1260 assert_spin_locked(&mm
->page_table_lock
);
1263 pgtable
= mm
->pmd_huge_pte
;
1264 lh
= (struct list_head
*) pgtable
;
1266 mm
->pmd_huge_pte
= NULL
;
1268 mm
->pmd_huge_pte
= (pgtable_t
) lh
->next
;
1271 ptep
= (pte_t
*) pgtable
;
1272 pte_val(*ptep
) = _PAGE_INVALID
;
1274 pte_val(*ptep
) = _PAGE_INVALID
;
1277 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */