drivers/vfio/vfio_iommu_type1.c

   1 /*
   2  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3  *
   4  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5  *     Author: Alex Williamson <alex.williamson@redhat.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License version 2 as
   9  * published by the Free Software Foundation.
  10  *
  11  * Derived from original vfio:
  12  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13  * Author: Tom Lyon, pugs@cisco.com
  14  *
  15  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  17  * VT-d, but that makes it harder to re-use as theoretically anyone
  18  * implementing a similar IOMMU could make use of this.  We expect the
  19  * IOMMU to support the IOMMU API and have few to no restrictions around
  20  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21  * optimized for relatively static mappings of a userspace process with
   22  * userspace pages pinned into memory.  We also assume devices and IOMMU
  23  * domains are PCI based as the IOMMU API is still centered around a
  24  * device/bus interface rather than a group interface.
  25  */
  26
  27 #include <linux/compat.h>
  28 #include <linux/device.h>
  29 #include <linux/fs.h>
  30 #include <linux/iommu.h>
  31 #include <linux/module.h>
  32 #include <linux/mm.h>
  33 #include <linux/pci.h>          /* pci_bus_type */
  34 #include <linux/rbtree.h>
  35 #include <linux/sched.h>
  36 #include <linux/slab.h>
  37 #include <linux/uaccess.h>
  38 #include <linux/vfio.h>
  39 #include <linux/workqueue.h>
  40
  41 #define DRIVER_VERSION  "0.2"
  42 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  43 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  44
  45 static bool allow_unsafe_interrupts;
  46 module_param_named(allow_unsafe_interrupts,
  47                    allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  48 MODULE_PARM_DESC(allow_unsafe_interrupts,
  49                  "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  50
  51 static bool disable_hugepages;
  52 module_param_named(disable_hugepages,
  53                    disable_hugepages, bool, S_IRUGO | S_IWUSR);
  54 MODULE_PARM_DESC(disable_hugepages,
  55                  "Disable VFIO IOMMU support for IOMMU hugepages.");
  56
  57 struct vfio_iommu {
  58         struct iommu_domain     *domain;
  59         struct mutex            lock;
  60         struct rb_root          dma_list;
  61         struct list_head        group_list;
  62         bool                    cache;
  63 };
  64
  65 struct vfio_dma {
  66         struct rb_node          node;
  67         dma_addr_t              iova;           /* Device address */
  68         unsigned long           vaddr;          /* Process virtual addr */
  69         size_t                  size;           /* Map size (bytes) */
  70         int                     prot;           /* IOMMU_READ/WRITE */
  71 };
  72
  73 struct vfio_group {
  74         struct iommu_group      *iommu_group;
  75         struct list_head        next;
  76 };
  77
  78 /*
  79  * This code handles mapping and unmapping of user data buffers
  80  * into DMA'ble space using the IOMMU
  81  */
  82
  83 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  84                                       dma_addr_t start, size_t size)
  85 {
  86         struct rb_node *node = iommu->dma_list.rb_node;
  87
  88         while (node) {
  89                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  90
  91                 if (start + size <= dma->iova)
  92                         node = node->rb_left;
  93                 else if (start >= dma->iova + dma->size)
  94                         node = node->rb_right;
  95                 else
  96                         return dma;
  97         }
  98
  99         return NULL;
 100 }
 101
 102 static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 103 {
 104         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 105         struct vfio_dma *dma;
 106
 107         while (*link) {
 108                 parent = *link;
 109                 dma = rb_entry(parent, struct vfio_dma, node);
 110
 111                 if (new->iova + new->size <= dma->iova)
 112                         link = &(*link)->rb_left;
 113                 else
 114                         link = &(*link)->rb_right;
 115         }
 116
 117         rb_link_node(&new->node, parent, link);
 118         rb_insert_color(&new->node, &iommu->dma_list);
 119 }
 120
 121 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 122 {
 123         rb_erase(&old->node, &iommu->dma_list);
 124 }
 125
 126 struct vwork {
 127         struct mm_struct        *mm;
 128         long                    npage;
 129         struct work_struct      work;
 130 };
 131
 132 /* delayed decrement/increment for locked_vm */
 133 static void vfio_lock_acct_bg(struct work_struct *work)
 134 {
 135         struct vwork *vwork = container_of(work, struct vwork, work);
 136         struct mm_struct *mm;
 137
 138         mm = vwork->mm;
 139         down_write(&mm->mmap_sem);
 140         mm->locked_vm += vwork->npage;
 141         up_write(&mm->mmap_sem);
 142         mmput(mm);
 143         kfree(vwork);
 144 }
 145
 146 static void vfio_lock_acct(long npage)
 147 {
 148         struct vwork *vwork;
 149         struct mm_struct *mm;
 150
 151         if (!current->mm || !npage)
 152                 return; /* process exited or nothing to do */
 153
 154         if (down_write_trylock(&current->mm->mmap_sem)) {
 155                 current->mm->locked_vm += npage;
 156                 up_write(&current->mm->mmap_sem);
 157                 return;
 158         }
 159
 160         /*
 161          * Couldn't get mmap_sem lock, so must setup to update
 162          * mm->locked_vm later. If locked_vm were atomic, we
 163          * wouldn't need this silliness
 164          */
 165         vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 166         if (!vwork)
 167                 return;
 168         mm = get_task_mm(current);
 169         if (!mm) {
 170                 kfree(vwork);
 171                 return;
 172         }
 173         INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 174         vwork->mm = mm;
 175         vwork->npage = npage;
 176         schedule_work(&vwork->work);
 177 }
 178
 179 /*
 180  * Some mappings aren't backed by a struct page, for example an mmap'd
 181  * MMIO range for our own or another device.  These use a different
 182  * pfn conversion and shouldn't be tracked as locked pages.
 183  */
 184 static bool is_invalid_reserved_pfn(unsigned long pfn)
 185 {
 186         if (pfn_valid(pfn)) {
 187                 bool reserved;
 188                 struct page *tail = pfn_to_page(pfn);
 189                 struct page *head = compound_trans_head(tail);
 190                 reserved = !!(PageReserved(head));
 191                 if (head != tail) {
 192                         /*
 193                          * "head" is not a dangling pointer
 194                          * (compound_trans_head takes care of that)
 195                          * but the hugepage may have been split
 196                          * from under us (and we may not hold a
 197                          * reference count on the head page so it can
 198                          * be reused before we run PageReferenced), so
 199                          * we've to check PageTail before returning
 200                          * what we just read.
 201                          */
 202                         smp_rmb();
 203                         if (PageTail(tail))
 204                                 return reserved;
 205                 }
 206                 return PageReserved(tail);
 207         }
 208
 209         return true;
 210 }
 211
 212 static int put_pfn(unsigned long pfn, int prot)
 213 {
 214         if (!is_invalid_reserved_pfn(pfn)) {
 215                 struct page *page = pfn_to_page(pfn);
 216                 if (prot & IOMMU_WRITE)
 217                         SetPageDirty(page);
 218                 put_page(page);
 219                 return 1;
 220         }
 221         return 0;
 222 }
 223
 224 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 225 {
 226         struct page *page[1];
 227         struct vm_area_struct *vma;
 228         int ret = -EFAULT;
 229
 230         if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 231                 *pfn = page_to_pfn(page[0]);
 232                 return 0;
 233         }
 234
 235         down_read(&current->mm->mmap_sem);
 236
 237         vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 238
 239         if (vma && vma->vm_flags & VM_PFNMAP) {
 240                 *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 241                 if (is_invalid_reserved_pfn(*pfn))
 242                         ret = 0;
 243         }
 244
 245         up_read(&current->mm->mmap_sem);
 246
 247         return ret;
 248 }
 249
 250 /*
 251  * Attempt to pin pages.  We really don't want to track all the pfns and
 252  * the iommu can only map chunks of consecutive pfns anyway, so get the
 253  * first page and all consecutive pages with the same locking.
 254  */
 255 static long vfio_pin_pages(unsigned long vaddr, long npage,
 256                            int prot, unsigned long *pfn_base)
 257 {
 258         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 259         bool lock_cap = capable(CAP_IPC_LOCK);
 260         long ret, i;
 261
 262         if (!current->mm)
 263                 return -ENODEV;
 264
 265         ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 266         if (ret)
 267                 return ret;
 268
 269         if (is_invalid_reserved_pfn(*pfn_base))
 270                 return 1;
 271
 272         if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 273                 put_pfn(*pfn_base, prot);
 274                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 275                         limit << PAGE_SHIFT);
 276                 return -ENOMEM;
 277         }
 278
 279         if (unlikely(disable_hugepages)) {
 280                 vfio_lock_acct(1);
 281                 return 1;
 282         }
 283
 284         /* Lock all the consecutive pages from pfn_base */
 285         for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 286                 unsigned long pfn = 0;
 287
 288                 ret = vaddr_get_pfn(vaddr, prot, &pfn);
 289                 if (ret)
 290                         break;
 291
 292                 if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
 293                         put_pfn(pfn, prot);
 294                         break;
 295                 }
 296
 297                 if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 298                         put_pfn(pfn, prot);
 299                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 300                                 __func__, limit << PAGE_SHIFT);
 301                         break;
 302                 }
 303         }
 304
 305         vfio_lock_acct(i);
 306
 307         return i;
 308 }
 309
 310 static long vfio_unpin_pages(unsigned long pfn, long npage,
 311                              int prot, bool do_accounting)
 312 {
 313         unsigned long unlocked = 0;
 314         long i;
 315
 316         for (i = 0; i < npage; i++)
 317                 unlocked += put_pfn(pfn++, prot);
 318
 319         if (do_accounting)
 320                 vfio_lock_acct(-unlocked);
 321
 322         return unlocked;
 323 }
 324
 325 static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 326                             dma_addr_t iova, size_t *size)
 327 {
 328         dma_addr_t start = iova, end = iova + *size;
 329         long unlocked = 0;
 330
 331         while (iova < end) {
 332                 size_t unmapped;
 333                 phys_addr_t phys;
 334
 335                 /*
 336                  * We use the IOMMU to track the physical address.  This
 337                  * saves us from having a lot more entries in our mapping
 338                  * tree.  The downside is that we don't track the size
 339                  * used to do the mapping.  We request unmap of a single
 340                  * page, but expect IOMMUs that support large pages to
 341                  * unmap a larger chunk.
 342                  */
 343                 phys = iommu_iova_to_phys(iommu->domain, iova);
 344                 if (WARN_ON(!phys)) {
 345                         iova += PAGE_SIZE;
 346                         continue;
 347                 }
 348
 349                 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 350                 if (!unmapped)
 351                         break;
 352
 353                 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 354                                              unmapped >> PAGE_SHIFT,
 355                                              dma->prot, false);
 356                 iova += unmapped;
 357         }
 358
 359         vfio_lock_acct(-unlocked);
 360
 361         *size = iova - start;
 362
 363         return 0;
 364 }
 365
 366 static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
 367                                    size_t *size, struct vfio_dma *dma)
 368 {
 369         size_t offset, overlap, tmp;
 370         struct vfio_dma *split;
 371         int ret;
 372
 373         /*
 374          * Existing dma region is completely covered, unmap all.  This is
 375          * the likely case since userspace tends to map and unmap buffers
 376          * in one shot rather than multiple mappings within a buffer.
 377          */
 378         if (likely(start <= dma->iova &&
 379                    start + *size >= dma->iova + dma->size)) {
 380                 *size = dma->size;
 381                 ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
 382                 if (ret)
 383                         return ret;
 384
 385                 /*
 386                  * Did we remove more than we have?  Should never happen
 387                  * since a vfio_dma is contiguous in iova and vaddr.
 388                  */
 389                 WARN_ON(*size != dma->size);
 390
 391                 vfio_remove_dma(iommu, dma);
 392                 kfree(dma);
 393                 return 0;
 394         }
 395
 396         /* Overlap low address of existing range */
 397         if (start <= dma->iova) {
 398                 overlap = start + *size - dma->iova;
 399                 ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
 400                 if (ret)
 401                         return ret;
 402
 403                 vfio_remove_dma(iommu, dma);
 404
 405                 /*
 406                  * Check, we may have removed to whole vfio_dma.  If not
 407                  * fixup and re-insert.
 408                  */
 409                 if (overlap < dma->size) {
 410                         dma->iova += overlap;
 411                         dma->vaddr += overlap;
 412                         dma->size -= overlap;
 413                         vfio_insert_dma(iommu, dma);
 414                 }
 415                 *size = overlap;
 416                 return 0;
 417         }
 418
 419         /* Overlap high address of existing range */
 420         if (start + *size >= dma->iova + dma->size) {
 421                 offset = start - dma->iova;
 422                 overlap = dma->size - offset;
 423
 424                 ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
 425                 if (ret)
 426                         return ret;
 427
 428                 /*
 429                  * We may have unmapped the entire vfio_dma if the user is
 430                  * trying to unmap a sub-region of what was originally
 431                  * mapped.  If anything left, we can resize in place since
 432                  * iova is unchanged.
 433                  */
 434                 if (overlap < dma->size)
 435                         dma->size -= overlap;
 436                 else
 437                         vfio_remove_dma(iommu, dma);
 438
 439                 *size = overlap;
 440                 return 0;
 441         }
 442
 443         /* Split existing */
 444         offset = start - dma->iova;
 445
 446         ret = vfio_unmap_unpin(iommu, dma, start, size);
 447         if (ret)
 448                 return ret;
 449
 450         WARN_ON(!*size);
 451         tmp = dma->size;
 452
 453         /*
 454          * Resize the lower vfio_dma in place, insert new for remaining
 455          * upper segment.
 456          */
 457         dma->size = offset;
 458
 459         if (offset + *size < tmp) {
 460                 split = kzalloc(sizeof(*split), GFP_KERNEL);
 461                 if (!split)
 462                         return -ENOMEM;
 463
 464                 split->size = tmp - offset - *size;
 465                 split->iova = dma->iova + offset + *size;
 466                 split->vaddr = dma->vaddr + offset + *size;
 467                 split->prot = dma->prot;
 468                 vfio_insert_dma(iommu, split);
 469         }
 470
 471         return 0;
 472 }
 473
 474 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 475                              struct vfio_iommu_type1_dma_unmap *unmap)
 476 {
 477         uint64_t mask;
 478         struct vfio_dma *dma;
 479         size_t unmapped = 0, size;
 480         int ret = 0;
 481
 482         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 483
 484         if (unmap->iova & mask)
 485                 return -EINVAL;
 486         if (unmap->size & mask)
 487                 return -EINVAL;
 488
 489         WARN_ON(mask & PAGE_MASK);
 490
 491         mutex_lock(&iommu->lock);
 492
 493         while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 494                 size = unmap->size;
 495                 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
 496                 if (ret)
 497                         break;
 498                 unmapped += size;
 499         }
 500
 501         mutex_unlock(&iommu->lock);
 502
 503         /*
 504          * We may unmap more than requested, update the unmap struct so
 505          * userspace can know.
 506          */
 507         unmap->size = unmapped;
 508
 509         return ret;
 510 }
 511
 512 /*
 513  * Turns out AMD IOMMU has a page table bug where it won't map large pages
 514  * to a region that previously mapped smaller pages.  This should be fixed
 515  * soon, so this is just a temporary workaround to break mappings down into
 516  * PAGE_SIZE.  Better to map smaller pages than nothing.
 517  */
 518 static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
 519                           unsigned long pfn, long npage, int prot)
 520 {
 521         long i;
 522         int ret;
 523
 524         for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 525                 ret = iommu_map(iommu->domain, iova,
 526                                 (phys_addr_t)pfn << PAGE_SHIFT,
 527                                 PAGE_SIZE, prot);
 528                 if (ret)
 529                         break;
 530         }
 531
 532         for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 533                 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 534
 535         return ret;
 536 }
 537
 538 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 539                            struct vfio_iommu_type1_dma_map *map)
 540 {
 541         dma_addr_t end, iova;
 542         unsigned long vaddr = map->vaddr;
 543         size_t size = map->size;
 544         long npage;
 545         int ret = 0, prot = 0;
 546         uint64_t mask;
 547
 548         end = map->iova + map->size;
 549
 550         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 551
 552         /* READ/WRITE from device perspective */
 553         if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 554                 prot |= IOMMU_WRITE;
 555         if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 556                 prot |= IOMMU_READ;
 557
 558         if (!prot)
 559                 return -EINVAL; /* No READ/WRITE? */
 560
 561         if (iommu->cache)
 562                 prot |= IOMMU_CACHE;
 563
 564         if (vaddr & mask)
 565                 return -EINVAL;
 566         if (map->iova & mask)
 567                 return -EINVAL;
 568         if (!map->size || map->size & mask)
 569                 return -EINVAL;
 570
 571         WARN_ON(mask & PAGE_MASK);
 572
 573         /* Don't allow IOVA wrap */
 574         if (end && end < map->iova)
 575                 return -EINVAL;
 576
 577         /* Don't allow virtual address wrap */
 578         if (vaddr + map->size && vaddr + map->size < vaddr)
 579                 return -EINVAL;
 580
 581         mutex_lock(&iommu->lock);
 582
 583         if (vfio_find_dma(iommu, map->iova, map->size)) {
 584                 mutex_unlock(&iommu->lock);
 585                 return -EEXIST;
 586         }
 587
 588         for (iova = map->iova; iova < end; iova += size, vaddr += size) {
 589                 struct vfio_dma *dma = NULL;
 590                 unsigned long pfn;
 591                 long i;
 592
 593                 /* Pin a contiguous chunk of memory */
 594                 npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
 595                                        prot, &pfn);
 596                 if (npage <= 0) {
 597                         WARN_ON(!npage);
 598                         ret = (int)npage;
 599                         break;
 600                 }
 601
 602                 /* Verify pages are not already mapped */
 603                 for (i = 0; i < npage; i++) {
 604                         if (iommu_iova_to_phys(iommu->domain,
 605                                                iova + (i << PAGE_SHIFT))) {
 606                                 vfio_unpin_pages(pfn, npage, prot, true);
 607                                 ret = -EBUSY;
 608                                 break;
 609                         }
 610                 }
 611
 612                 ret = iommu_map(iommu->domain, iova,
 613                                 (phys_addr_t)pfn << PAGE_SHIFT,
 614                                 npage << PAGE_SHIFT, prot);
 615                 if (ret) {
 616                         if (ret != -EBUSY ||
 617                             map_try_harder(iommu, iova, pfn, npage, prot)) {
 618                                 vfio_unpin_pages(pfn, npage, prot, true);
 619                                 break;
 620                         }
 621                 }
 622
 623                 size = npage << PAGE_SHIFT;
 624
 625                 /*
 626                  * Check if we abut a region below - nothing below 0.
 627                  * This is the most likely case when mapping chunks of
 628                  * physically contiguous regions within a virtual address
 629                  * range.  Update the abutting entry in place since iova
 630                  * doesn't change.
 631                  */
 632                 if (likely(iova)) {
 633                         struct vfio_dma *tmp;
 634                         tmp = vfio_find_dma(iommu, iova - 1, 1);
 635                         if (tmp && tmp->prot == prot &&
 636                             tmp->vaddr + tmp->size == vaddr) {
 637                                 tmp->size += size;
 638
 639                                 iova = tmp->iova;
 640                                 size = tmp->size;
 641                                 vaddr = tmp->vaddr;
 642                                 dma = tmp;
 643                         }
 644                 }
 645
 646                 /* Check if we abut a region above - nothing above ~0 + 1 */
 647                 if (likely(iova + size)) {
 648                         struct vfio_dma *tmp;
 649
 650                         tmp = vfio_find_dma(iommu, iova + size, 1);
 651                         if (tmp && tmp->prot == prot &&
 652                             tmp->vaddr == vaddr + size) {
 653                                 vfio_remove_dma(iommu, tmp);
 654                                 if (dma)
 655                                         dma->size += tmp->size;
 656                                 else
 657                                         size += tmp->size;
 658                                 kfree(tmp);
 659                         }
 660                 }
 661
 662                 if (!dma) {
 663                         dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 664                         if (!dma) {
 665                                 iommu_unmap(iommu->domain, iova, size);
 666                                 vfio_unpin_pages(pfn, npage, prot, true);
 667                                 ret = -ENOMEM;
 668                                 break;
 669                         }
 670
 671                         dma->size = size;
 672                         dma->iova = iova;
 673                         dma->vaddr = vaddr;
 674                         dma->prot = prot;
 675                         vfio_insert_dma(iommu, dma);
 676                 }
 677         }
 678
 679         if (ret) {
 680                 struct vfio_dma *tmp;
 681                 iova = map->iova;
 682                 size = map->size;
 683                 while ((tmp = vfio_find_dma(iommu, iova, size))) {
 684                         if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
 685                                 pr_warn("%s: Error rolling back failed map\n",
 686                                         __func__);
 687                                 break;
 688                         }
 689                 }
 690         }
 691
 692         mutex_unlock(&iommu->lock);
 693         return ret;
 694 }
 695
 696 static int vfio_iommu_type1_attach_group(void *iommu_data,
 697                                          struct iommu_group *iommu_group)
 698 {
 699         struct vfio_iommu *iommu = iommu_data;
 700         struct vfio_group *group, *tmp;
 701         int ret;
 702
 703         group = kzalloc(sizeof(*group), GFP_KERNEL);
 704         if (!group)
 705                 return -ENOMEM;
 706
 707         mutex_lock(&iommu->lock);
 708
 709         list_for_each_entry(tmp, &iommu->group_list, next) {
 710                 if (tmp->iommu_group == iommu_group) {
 711                         mutex_unlock(&iommu->lock);
 712                         kfree(group);
 713                         return -EINVAL;
 714                 }
 715         }
 716
 717         /*
 718          * TODO: Domain have capabilities that might change as we add
 719          * groups (see iommu->cache, currently never set).  Check for
 720          * them and potentially disallow groups to be attached when it
 721          * would change capabilities (ugh).
 722          */
 723         ret = iommu_attach_group(iommu->domain, iommu_group);
 724         if (ret) {
 725                 mutex_unlock(&iommu->lock);
 726                 kfree(group);
 727                 return ret;
 728         }
 729
 730         group->iommu_group = iommu_group;
 731         list_add(&group->next, &iommu->group_list);
 732
 733         mutex_unlock(&iommu->lock);
 734
 735         return 0;
 736 }
 737
 738 static void vfio_iommu_type1_detach_group(void *iommu_data,
 739                                           struct iommu_group *iommu_group)
 740 {
 741         struct vfio_iommu *iommu = iommu_data;
 742         struct vfio_group *group;
 743
 744         mutex_lock(&iommu->lock);
 745
 746         list_for_each_entry(group, &iommu->group_list, next) {
 747                 if (group->iommu_group == iommu_group) {
 748                         iommu_detach_group(iommu->domain, iommu_group);
 749                         list_del(&group->next);
 750                         kfree(group);
 751                         break;
 752                 }
 753         }
 754
 755         mutex_unlock(&iommu->lock);
 756 }
 757
 758 static void *vfio_iommu_type1_open(unsigned long arg)
 759 {
 760         struct vfio_iommu *iommu;
 761
 762         if (arg != VFIO_TYPE1_IOMMU)
 763                 return ERR_PTR(-EINVAL);
 764
 765         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 766         if (!iommu)
 767                 return ERR_PTR(-ENOMEM);
 768
 769         INIT_LIST_HEAD(&iommu->group_list);
 770         iommu->dma_list = RB_ROOT;
 771         mutex_init(&iommu->lock);
 772
 773         /*
 774          * Wish we didn't have to know about bus_type here.
 775          */
 776         iommu->domain = iommu_domain_alloc(&pci_bus_type);
 777         if (!iommu->domain) {
 778                 kfree(iommu);
 779                 return ERR_PTR(-EIO);
 780         }
 781
 782         /*
 783          * Wish we could specify required capabilities rather than create
 784          * a domain, see what comes out and hope it doesn't change along
 785          * the way.  Fortunately we know interrupt remapping is global for
 786          * our iommus.
 787          */
 788         if (!allow_unsafe_interrupts &&
 789             !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
 790                 pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 791                        __func__);
 792                 iommu_domain_free(iommu->domain);
 793                 kfree(iommu);
 794                 return ERR_PTR(-EPERM);
 795         }
 796
 797         return iommu;
 798 }
 799
 800 static void vfio_iommu_type1_release(void *iommu_data)
 801 {
 802         struct vfio_iommu *iommu = iommu_data;
 803         struct vfio_group *group, *group_tmp;
 804         struct rb_node *node;
 805
 806         list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 807                 iommu_detach_group(iommu->domain, group->iommu_group);
 808                 list_del(&group->next);
 809                 kfree(group);
 810         }
 811
 812         while ((node = rb_first(&iommu->dma_list))) {
 813                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 814                 size_t size = dma->size;
 815                 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 816         }
 817
 818         iommu_domain_free(iommu->domain);
 819         iommu->domain = NULL;
 820         kfree(iommu);
 821 }
 822
 823 static long vfio_iommu_type1_ioctl(void *iommu_data,
 824                                    unsigned int cmd, unsigned long arg)
 825 {
 826         struct vfio_iommu *iommu = iommu_data;
 827         unsigned long minsz;
 828
 829         if (cmd == VFIO_CHECK_EXTENSION) {
 830                 switch (arg) {
 831                 case VFIO_TYPE1_IOMMU:
 832                         return 1;
 833                 default:
 834                         return 0;
 835                 }
 836         } else if (cmd == VFIO_IOMMU_GET_INFO) {
 837                 struct vfio_iommu_type1_info info;
 838
 839                 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 840
 841                 if (copy_from_user(&info, (void __user *)arg, minsz))
 842                         return -EFAULT;
 843
 844                 if (info.argsz < minsz)
 845                         return -EINVAL;
 846
 847                 info.flags = 0;
 848
 849                 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
 850
 851                 return copy_to_user((void __user *)arg, &info, minsz);
 852
 853         } else if (cmd == VFIO_IOMMU_MAP_DMA) {
 854                 struct vfio_iommu_type1_dma_map map;
 855                 uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
 856                                 VFIO_DMA_MAP_FLAG_WRITE;
 857
 858                 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 859
 860                 if (copy_from_user(&map, (void __user *)arg, minsz))
 861                         return -EFAULT;
 862
 863                 if (map.argsz < minsz || map.flags & ~mask)
 864                         return -EINVAL;
 865
 866                 return vfio_dma_do_map(iommu, &map);
 867
 868         } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 869                 struct vfio_iommu_type1_dma_unmap unmap;
 870                 long ret;
 871
 872                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 873
 874                 if (copy_from_user(&unmap, (void __user *)arg, minsz))
 875                         return -EFAULT;
 876
 877                 if (unmap.argsz < minsz || unmap.flags)
 878                         return -EINVAL;
 879
 880                 ret = vfio_dma_do_unmap(iommu, &unmap);
 881                 if (ret)
 882                         return ret;
 883
 884                 return copy_to_user((void __user *)arg, &unmap, minsz);
 885         }
 886
 887         return -ENOTTY;
 888 }
 889
 890 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 891         .name           = "vfio-iommu-type1",
 892         .owner          = THIS_MODULE,
 893         .open           = vfio_iommu_type1_open,
 894         .release        = vfio_iommu_type1_release,
 895         .ioctl          = vfio_iommu_type1_ioctl,
 896         .attach_group   = vfio_iommu_type1_attach_group,
 897         .detach_group   = vfio_iommu_type1_detach_group,
 898 };
 899
 900 static int __init vfio_iommu_type1_init(void)
 901 {
 902         if (!iommu_present(&pci_bus_type))
 903                 return -ENODEV;
 904
 905         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
 906 }
 907
 908 static void __exit vfio_iommu_type1_cleanup(void)
 909 {
 910         vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
 911 }
 912
 913 module_init(vfio_iommu_type1_init);
 914 module_exit(vfio_iommu_type1_cleanup);
 915
 916 MODULE_VERSION(DRIVER_VERSION);
 917 MODULE_LICENSE("GPL v2");
 918 MODULE_AUTHOR(DRIVER_AUTHOR);
 919 MODULE_DESCRIPTION(DRIVER_DESC);