/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions; any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on its
	 * bus.  We use iommu_present() again in the main code to detect these
	 * fake groups.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_present(dev->bus))
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_iommu_present(struct device *dev, void *unused)
{
	return iommu_present(dev->bus) ? 1 : 0;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_for_each_dev(iommu_group, NULL,
					vfio_iommu_present) ? -EINVAL : 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
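
/*
 * Editor's note: vfio_noiommu_ops above doubles as a minimal example of a
 * vfio_iommu_driver_ops implementation for the registration API below:
 * open() validates the requested iommu type and has no per-container state
 * to allocate, attach_group() rejects any group whose bus has a real IOMMU,
 * and the remaining callbacks are stubs.
 */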

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * they're freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
					    bool iommu_present)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
	group->noiommu = !iommu_present;

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * Blocking notifiers acquire a rwsem around registering and hold
	 * it around the callback.  Therefore, we need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
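
/*
 * Editor's note: the final group reference is dropped via kref_put_mutex()
 * so that vfio.group_lock is already held when vfio_group_release() runs.
 * This closes the race where a concurrent vfio.group_list lookup could
 * otherwise find and re-reference a group whose refcount just hit zero.
 */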

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}
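
/*
 * Editor's note: a PCI header type other than PCI_HEADER_TYPE_NORMAL
 * indicates a bridge (PCI-to-PCI or CardBus), i.e. the interconnect case
 * described above, so such devices are treated as whitelisted regardless
 * of which driver is bound to them.
 */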

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group, iommu_present(dev->bus));
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
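
/*
 * Editor's sketch (not part of this file): a VFIO bus driver's probe path
 * typically pairs vfio_iommu_group_get() with vfio_add_group_dev(), roughly
 * as below.  The names my_probe, my_device_ops and my_state are hypothetical.
 *
 *	static int my_probe(struct pci_dev *pdev,
 *			    const struct pci_device_id *id)
 *	{
 *		struct iommu_group *group;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(&pdev->dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(&pdev->dev, &my_device_ops, my_state);
 *		if (ret)
 *			vfio_iommu_group_put(group, &pdev->dev);
 *		return ret;
 *	}
 */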

/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);

/**
 * VFIO base fd, /dev/vfio/vfio
 */
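
/*
 * Editor's sketch (not part of this file): the documented userspace flow
 * for the container and group fds handled below, with error handling
 * omitted.  The group number 26 and the device name are hypothetical.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	int device;
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		abort();	// unknown API version
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */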
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		/* module reference holds the driver we're working on */
		mutex_unlock(&vfio.iommu_drivers_lock);

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			goto skip_drivers_unlock;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (!ret) {
			container->iommu_driver = driver;
			container->iommu_data = data;
		} else {
			driver->ops->release(data);
			module_put(driver->ops->owner);
		}

		goto skip_drivers_unlock;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
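
/*
 * Editor's sketch (not part of this file): before attaching a group to a
 * container, userspace can confirm the group is viable via the
 * VFIO_GROUP_GET_STATUS ioctl handled below.
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		abort();	// some group device is bound elsewhere
 */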
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that it still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
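
/*
 * Editor's note: atomic_dec_if_positive() never drops the count below zero
 * and returns the decremented value, so a result of 0 above means this call
 * released the last container user and the group must be detached.
 */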

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	if (!atomic_inc_not_zero(&group->container_users))
		return ERR_PTR(-EINVAL);

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EPERM);
	}

	if (!group->container->iommu_driver ||
	    !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EINVAL);
	}

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_put(group);
	vfio_group_try_dissolve_container(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
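
/*
 * Editor's note: the kvm-vfio pseudo device (virt/kvm/vfio.c) is the
 * in-tree consumer of this external user API; it holds a group reference
 * for as long as the group fd is registered with a KVM instance.
 */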

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
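
/*
 * Editor's note: the devnode callback prefixes "vfio/" so that devtmpfs
 * and udev create the group character devices under /dev/vfio/ (e.g. the
 * hypothetical /dev/vfio/26) rather than directly under /dev.
 */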

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");
	request_module_nowait("vfio_iommu_spapr_tce");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");