/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
19 #include <linux/virtio.h>
20 #include <linux/virtio_ring.h>
21 #include <linux/virtio_config.h>
22 #include <linux/device.h>
23 #include <linux/slab.h>
24 #include <linux/module.h>
25 #include <linux/hrtimer.h>
26 #include <linux/kmemleak.h>
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		BUG();						\
	} while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("%s:in_use = %i\n",		\
			      (_vq)->vq.name, (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
	} while (0)
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#else
/*
 * In production builds a bad ring is reported and the queue is flagged
 * broken instead of crashing.  The macro argument is parenthesized on
 * every use so that any pointer expression can be passed safely.
 */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		(_vq)->broken = true;				\
	} while (0)
/* Reentry tracking is compiled out without DEBUG. */
#define START_USE(vq)
#define END_USE(vq)
#endif
57 struct vring_virtqueue
61 /* Actual memory layout for this queue */
64 /* Can we use weak barriers? */
67 /* Other side has made a mess, don't try any more. */
70 /* Host supports indirect buffers */
73 /* Host publishes avail event idx */
76 /* Head of free buffer list. */
77 unsigned int free_head
;
78 /* Number we've added since last sync. */
79 unsigned int num_added
;
81 /* Last used index we've seen. */
84 /* How to notify other side. FIXME: commonalize hcalls! */
85 bool (*notify
)(struct virtqueue
*vq
);
88 /* They're supposed to lock for us. */
91 /* Figure out if their kicks are too delayed. */
92 bool last_add_time_valid
;
93 ktime_t last_add_time
;
96 /* Tokens for callbacks. */
100 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
102 static struct vring_desc
*alloc_indirect(unsigned int total_sg
, gfp_t gfp
)
104 struct vring_desc
*desc
;
108 * We require lowmem mappings for the descriptors because
109 * otherwise virt_to_phys will give us bogus addresses in the
112 gfp
&= ~(__GFP_HIGHMEM
| __GFP_HIGH
);
114 desc
= kmalloc(total_sg
* sizeof(struct vring_desc
), gfp
);
118 for (i
= 0; i
< total_sg
; i
++)
123 static inline int virtqueue_add(struct virtqueue
*_vq
,
124 struct scatterlist
*sgs
[],
125 unsigned int total_sg
,
126 unsigned int out_sgs
,
131 struct vring_virtqueue
*vq
= to_vvq(_vq
);
132 struct scatterlist
*sg
;
133 struct vring_desc
*desc
;
134 unsigned int i
, n
, avail
, descs_used
, uninitialized_var(prev
);
140 BUG_ON(data
== NULL
);
142 if (unlikely(vq
->broken
)) {
149 ktime_t now
= ktime_get();
151 /* No kick or get, with .1 second between? Warn. */
152 if (vq
->last_add_time_valid
)
153 WARN_ON(ktime_to_ms(ktime_sub(now
, vq
->last_add_time
))
155 vq
->last_add_time
= now
;
156 vq
->last_add_time_valid
= true;
160 BUG_ON(total_sg
> vq
->vring
.num
);
161 BUG_ON(total_sg
== 0);
163 head
= vq
->free_head
;
165 /* If the host supports indirect descriptor tables, and we have multiple
166 * buffers, then go indirect. FIXME: tune this threshold */
167 if (vq
->indirect
&& total_sg
> 1 && vq
->vq
.num_free
)
168 desc
= alloc_indirect(total_sg
, gfp
);
173 /* Use a single buffer which doesn't continue */
174 vq
->vring
.desc
[head
].flags
= VRING_DESC_F_INDIRECT
;
175 vq
->vring
.desc
[head
].addr
= virt_to_phys(desc
);
176 /* avoid kmemleak false positive (hidden by virt_to_phys) */
177 kmemleak_ignore(desc
);
178 vq
->vring
.desc
[head
].len
= total_sg
* sizeof(struct vring_desc
);
180 /* Set up rest to use this indirect table. */
185 desc
= vq
->vring
.desc
;
187 descs_used
= total_sg
;
191 if (vq
->vq
.num_free
< descs_used
) {
192 pr_debug("Can't add buf len %i - avail = %i\n",
193 descs_used
, vq
->vq
.num_free
);
194 /* FIXME: for historical reasons, we force a notify here if
195 * there are outgoing parts to the buffer. Presumably the
196 * host should service the ring ASAP. */
203 /* We're about to use some buffers from the free list. */
204 vq
->vq
.num_free
-= descs_used
;
206 for (n
= 0; n
< out_sgs
; n
++) {
207 for (sg
= sgs
[n
]; sg
; sg
= sg_next(sg
)) {
208 desc
[i
].flags
= VRING_DESC_F_NEXT
;
209 desc
[i
].addr
= sg_phys(sg
);
210 desc
[i
].len
= sg
->length
;
215 for (; n
< (out_sgs
+ in_sgs
); n
++) {
216 for (sg
= sgs
[n
]; sg
; sg
= sg_next(sg
)) {
217 desc
[i
].flags
= VRING_DESC_F_NEXT
|VRING_DESC_F_WRITE
;
218 desc
[i
].addr
= sg_phys(sg
);
219 desc
[i
].len
= sg
->length
;
224 /* Last one doesn't continue. */
225 desc
[prev
].flags
&= ~VRING_DESC_F_NEXT
;
227 /* Update free pointer */
229 vq
->free_head
= vq
->vring
.desc
[head
].next
;
234 vq
->data
[head
] = data
;
236 /* Put entry in available array (but don't update avail->idx until they
238 avail
= (vq
->vring
.avail
->idx
& (vq
->vring
.num
-1));
239 vq
->vring
.avail
->ring
[avail
] = head
;
241 /* Descriptors and available array need to be set before we expose the
242 * new available array entries. */
243 virtio_wmb(vq
->weak_barriers
);
244 vq
->vring
.avail
->idx
++;
247 /* This is very unlikely, but theoretically possible. Kick
249 if (unlikely(vq
->num_added
== (1 << 16) - 1))
252 pr_debug("Added buffer head %i to %p\n", head
, vq
);
259 * virtqueue_add_sgs - expose buffers to other end
260 * @vq: the struct virtqueue we're talking about.
261 * @sgs: array of terminated scatterlists.
262 * @out_num: the number of scatterlists readable by other side
263 * @in_num: the number of scatterlists which are writable (after readable ones)
264 * @data: the token identifying the buffer.
265 * @gfp: how to do memory allocations (if necessary).
267 * Caller must ensure we don't call this with other virtqueue operations
268 * at the same time (except where noted).
270 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
272 int virtqueue_add_sgs(struct virtqueue
*_vq
,
273 struct scatterlist
*sgs
[],
274 unsigned int out_sgs
,
279 unsigned int i
, total_sg
= 0;
281 /* Count them first. */
282 for (i
= 0; i
< out_sgs
+ in_sgs
; i
++) {
283 struct scatterlist
*sg
;
284 for (sg
= sgs
[i
]; sg
; sg
= sg_next(sg
))
287 return virtqueue_add(_vq
, sgs
, total_sg
, out_sgs
, in_sgs
, data
, gfp
);
289 EXPORT_SYMBOL_GPL(virtqueue_add_sgs
);
292 * virtqueue_add_outbuf - expose output buffers to other end
293 * @vq: the struct virtqueue we're talking about.
294 * @sg: scatterlist (must be well-formed and terminated!)
295 * @num: the number of entries in @sg readable by other side
296 * @data: the token identifying the buffer.
297 * @gfp: how to do memory allocations (if necessary).
299 * Caller must ensure we don't call this with other virtqueue operations
300 * at the same time (except where noted).
302 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
304 int virtqueue_add_outbuf(struct virtqueue
*vq
,
305 struct scatterlist
*sg
, unsigned int num
,
309 return virtqueue_add(vq
, &sg
, num
, 1, 0, data
, gfp
);
311 EXPORT_SYMBOL_GPL(virtqueue_add_outbuf
);
314 * virtqueue_add_inbuf - expose input buffers to other end
315 * @vq: the struct virtqueue we're talking about.
316 * @sg: scatterlist (must be well-formed and terminated!)
317 * @num: the number of entries in @sg writable by other side
318 * @data: the token identifying the buffer.
319 * @gfp: how to do memory allocations (if necessary).
321 * Caller must ensure we don't call this with other virtqueue operations
322 * at the same time (except where noted).
324 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
326 int virtqueue_add_inbuf(struct virtqueue
*vq
,
327 struct scatterlist
*sg
, unsigned int num
,
331 return virtqueue_add(vq
, &sg
, num
, 0, 1, data
, gfp
);
333 EXPORT_SYMBOL_GPL(virtqueue_add_inbuf
);
336 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
337 * @vq: the struct virtqueue
339 * Instead of virtqueue_kick(), you can do:
340 * if (virtqueue_kick_prepare(vq))
341 * virtqueue_notify(vq);
343 * This is sometimes useful because the virtqueue_kick_prepare() needs
344 * to be serialized, but the actual virtqueue_notify() call does not.
346 bool virtqueue_kick_prepare(struct virtqueue
*_vq
)
348 struct vring_virtqueue
*vq
= to_vvq(_vq
);
353 /* We need to expose available array entries before checking avail
355 virtio_mb(vq
->weak_barriers
);
357 old
= vq
->vring
.avail
->idx
- vq
->num_added
;
358 new = vq
->vring
.avail
->idx
;
362 if (vq
->last_add_time_valid
) {
363 WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
364 vq
->last_add_time
)) > 100);
366 vq
->last_add_time_valid
= false;
370 needs_kick
= vring_need_event(vring_avail_event(&vq
->vring
),
373 needs_kick
= !(vq
->vring
.used
->flags
& VRING_USED_F_NO_NOTIFY
);
378 EXPORT_SYMBOL_GPL(virtqueue_kick_prepare
);
381 * virtqueue_notify - second half of split virtqueue_kick call.
382 * @vq: the struct virtqueue
384 * This does not need to be serialized.
386 * Returns false if host notify failed or queue is broken, otherwise true.
388 bool virtqueue_notify(struct virtqueue
*_vq
)
390 struct vring_virtqueue
*vq
= to_vvq(_vq
);
392 if (unlikely(vq
->broken
))
395 /* Prod other side to tell it about changes. */
396 if (!vq
->notify(_vq
)) {
402 EXPORT_SYMBOL_GPL(virtqueue_notify
);
405 * virtqueue_kick - update after add_buf
406 * @vq: the struct virtqueue
408 * After one or more virtqueue_add_* calls, invoke this to kick
411 * Caller must ensure we don't call this with other virtqueue
412 * operations at the same time (except where noted).
414 * Returns false if kick failed, otherwise true.
416 bool virtqueue_kick(struct virtqueue
*vq
)
418 if (virtqueue_kick_prepare(vq
))
419 return virtqueue_notify(vq
);
422 EXPORT_SYMBOL_GPL(virtqueue_kick
);
424 static void detach_buf(struct vring_virtqueue
*vq
, unsigned int head
)
428 /* Clear data ptr. */
429 vq
->data
[head
] = NULL
;
431 /* Put back on free list: find end */
434 /* Free the indirect table */
435 if (vq
->vring
.desc
[i
].flags
& VRING_DESC_F_INDIRECT
)
436 kfree(phys_to_virt(vq
->vring
.desc
[i
].addr
));
438 while (vq
->vring
.desc
[i
].flags
& VRING_DESC_F_NEXT
) {
439 i
= vq
->vring
.desc
[i
].next
;
443 vq
->vring
.desc
[i
].next
= vq
->free_head
;
444 vq
->free_head
= head
;
445 /* Plus final descriptor */
449 static inline bool more_used(const struct vring_virtqueue
*vq
)
451 return vq
->last_used_idx
!= vq
->vring
.used
->idx
;
455 * virtqueue_get_buf - get the next used buffer
456 * @vq: the struct virtqueue we're talking about.
457 * @len: the length written into the buffer
459 * If the driver wrote data into the buffer, @len will be set to the
460 * amount written. This means you don't need to clear the buffer
461 * beforehand to ensure there's no data leakage in the case of short
464 * Caller must ensure we don't call this with other virtqueue
465 * operations at the same time (except where noted).
467 * Returns NULL if there are no used buffers, or the "data" token
468 * handed to virtqueue_add_*().
470 void *virtqueue_get_buf(struct virtqueue
*_vq
, unsigned int *len
)
472 struct vring_virtqueue
*vq
= to_vvq(_vq
);
479 if (unlikely(vq
->broken
)) {
484 if (!more_used(vq
)) {
485 pr_debug("No more buffers in queue\n");
490 /* Only get used array entries after they have been exposed by host. */
491 virtio_rmb(vq
->weak_barriers
);
493 last_used
= (vq
->last_used_idx
& (vq
->vring
.num
- 1));
494 i
= vq
->vring
.used
->ring
[last_used
].id
;
495 *len
= vq
->vring
.used
->ring
[last_used
].len
;
497 if (unlikely(i
>= vq
->vring
.num
)) {
498 BAD_RING(vq
, "id %u out of range\n", i
);
501 if (unlikely(!vq
->data
[i
])) {
502 BAD_RING(vq
, "id %u is not a head!\n", i
);
506 /* detach_buf clears data, so grab it now. */
510 /* If we expect an interrupt for the next entry, tell host
511 * by writing event index and flush out the write before
512 * the read in the next get_buf call. */
513 if (!(vq
->vring
.avail
->flags
& VRING_AVAIL_F_NO_INTERRUPT
)) {
514 vring_used_event(&vq
->vring
) = vq
->last_used_idx
;
515 virtio_mb(vq
->weak_barriers
);
519 vq
->last_add_time_valid
= false;
525 EXPORT_SYMBOL_GPL(virtqueue_get_buf
);
528 * virtqueue_disable_cb - disable callbacks
529 * @vq: the struct virtqueue we're talking about.
531 * Note that this is not necessarily synchronous, hence unreliable and only
532 * useful as an optimization.
534 * Unlike other operations, this need not be serialized.
536 void virtqueue_disable_cb(struct virtqueue
*_vq
)
538 struct vring_virtqueue
*vq
= to_vvq(_vq
);
540 vq
->vring
.avail
->flags
|= VRING_AVAIL_F_NO_INTERRUPT
;
542 EXPORT_SYMBOL_GPL(virtqueue_disable_cb
);
545 * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
546 * @vq: the struct virtqueue we're talking about.
548 * This re-enables callbacks; it returns current queue state
549 * in an opaque unsigned value. This value should be later tested by
550 * virtqueue_poll, to detect a possible race between the driver checking for
551 * more work, and enabling callbacks.
553 * Caller must ensure we don't call this with other virtqueue
554 * operations at the same time (except where noted).
556 unsigned virtqueue_enable_cb_prepare(struct virtqueue
*_vq
)
558 struct vring_virtqueue
*vq
= to_vvq(_vq
);
563 /* We optimistically turn back on interrupts, then check if there was
565 /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
566 * either clear the flags bit or point the event index at the next
567 * entry. Always do both to keep code simple. */
568 vq
->vring
.avail
->flags
&= ~VRING_AVAIL_F_NO_INTERRUPT
;
569 vring_used_event(&vq
->vring
) = last_used_idx
= vq
->last_used_idx
;
571 return last_used_idx
;
573 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare
);
576 * virtqueue_poll - query pending used buffers
577 * @vq: the struct virtqueue we're talking about.
578 * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
580 * Returns "true" if there are pending used buffers in the queue.
582 * This does not need to be serialized.
584 bool virtqueue_poll(struct virtqueue
*_vq
, unsigned last_used_idx
)
586 struct vring_virtqueue
*vq
= to_vvq(_vq
);
588 virtio_mb(vq
->weak_barriers
);
589 return (u16
)last_used_idx
!= vq
->vring
.used
->idx
;
591 EXPORT_SYMBOL_GPL(virtqueue_poll
);
594 * virtqueue_enable_cb - restart callbacks after disable_cb.
595 * @vq: the struct virtqueue we're talking about.
597 * This re-enables callbacks; it returns "false" if there are pending
598 * buffers in the queue, to detect a possible race between the driver
599 * checking for more work, and enabling callbacks.
601 * Caller must ensure we don't call this with other virtqueue
602 * operations at the same time (except where noted).
604 bool virtqueue_enable_cb(struct virtqueue
*_vq
)
606 unsigned last_used_idx
= virtqueue_enable_cb_prepare(_vq
);
607 return !virtqueue_poll(_vq
, last_used_idx
);
609 EXPORT_SYMBOL_GPL(virtqueue_enable_cb
);
612 * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
613 * @vq: the struct virtqueue we're talking about.
615 * This re-enables callbacks but hints to the other side to delay
616 * interrupts until most of the available buffers have been processed;
617 * it returns "false" if there are many pending buffers in the queue,
618 * to detect a possible race between the driver checking for more work,
619 * and enabling callbacks.
621 * Caller must ensure we don't call this with other virtqueue
622 * operations at the same time (except where noted).
624 bool virtqueue_enable_cb_delayed(struct virtqueue
*_vq
)
626 struct vring_virtqueue
*vq
= to_vvq(_vq
);
631 /* We optimistically turn back on interrupts, then check if there was
633 /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
634 * either clear the flags bit or point the event index at the next
635 * entry. Always do both to keep code simple. */
636 vq
->vring
.avail
->flags
&= ~VRING_AVAIL_F_NO_INTERRUPT
;
637 /* TODO: tune this threshold */
638 bufs
= (u16
)(vq
->vring
.avail
->idx
- vq
->last_used_idx
) * 3 / 4;
639 vring_used_event(&vq
->vring
) = vq
->last_used_idx
+ bufs
;
640 virtio_mb(vq
->weak_barriers
);
641 if (unlikely((u16
)(vq
->vring
.used
->idx
- vq
->last_used_idx
) > bufs
)) {
649 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed
);
652 * virtqueue_detach_unused_buf - detach first unused buffer
653 * @vq: the struct virtqueue we're talking about.
655 * Returns NULL or the "data" token handed to virtqueue_add_*().
656 * This is not valid on an active queue; it is useful only for device
659 void *virtqueue_detach_unused_buf(struct virtqueue
*_vq
)
661 struct vring_virtqueue
*vq
= to_vvq(_vq
);
667 for (i
= 0; i
< vq
->vring
.num
; i
++) {
670 /* detach_buf clears data, so grab it now. */
673 vq
->vring
.avail
->idx
--;
677 /* That should have freed everything. */
678 BUG_ON(vq
->vq
.num_free
!= vq
->vring
.num
);
683 EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf
);
685 irqreturn_t
vring_interrupt(int irq
, void *_vq
)
687 struct vring_virtqueue
*vq
= to_vvq(_vq
);
689 if (!more_used(vq
)) {
690 pr_debug("virtqueue interrupt with no work for %p\n", vq
);
694 if (unlikely(vq
->broken
))
697 pr_debug("virtqueue callback for %p (%p)\n", vq
, vq
->vq
.callback
);
699 vq
->vq
.callback(&vq
->vq
);
703 EXPORT_SYMBOL_GPL(vring_interrupt
);
705 struct virtqueue
*vring_new_virtqueue(unsigned int index
,
707 unsigned int vring_align
,
708 struct virtio_device
*vdev
,
711 bool (*notify
)(struct virtqueue
*),
712 void (*callback
)(struct virtqueue
*),
715 struct vring_virtqueue
*vq
;
718 /* We assume num is a power of 2. */
719 if (num
& (num
- 1)) {
720 dev_warn(&vdev
->dev
, "Bad virtqueue length %u\n", num
);
724 vq
= kmalloc(sizeof(*vq
) + sizeof(void *)*num
, GFP_KERNEL
);
728 vring_init(&vq
->vring
, num
, pages
, vring_align
);
729 vq
->vq
.callback
= callback
;
732 vq
->vq
.num_free
= num
;
733 vq
->vq
.index
= index
;
735 vq
->weak_barriers
= weak_barriers
;
737 vq
->last_used_idx
= 0;
739 list_add_tail(&vq
->vq
.list
, &vdev
->vqs
);
742 vq
->last_add_time_valid
= false;
745 vq
->indirect
= virtio_has_feature(vdev
, VIRTIO_RING_F_INDIRECT_DESC
);
746 vq
->event
= virtio_has_feature(vdev
, VIRTIO_RING_F_EVENT_IDX
);
748 /* No callback? Tell other side not to bother us. */
750 vq
->vring
.avail
->flags
|= VRING_AVAIL_F_NO_INTERRUPT
;
752 /* Put everything in free lists. */
754 for (i
= 0; i
< num
-1; i
++) {
755 vq
->vring
.desc
[i
].next
= i
+1;
762 EXPORT_SYMBOL_GPL(vring_new_virtqueue
);
764 void vring_del_virtqueue(struct virtqueue
*vq
)
769 EXPORT_SYMBOL_GPL(vring_del_virtqueue
);
771 /* Manipulates transport-specific feature bits. */
772 void vring_transport_features(struct virtio_device
*vdev
)
776 for (i
= VIRTIO_TRANSPORT_F_START
; i
< VIRTIO_TRANSPORT_F_END
; i
++) {
778 case VIRTIO_RING_F_INDIRECT_DESC
:
780 case VIRTIO_RING_F_EVENT_IDX
:
783 /* We don't understand this bit. */
784 clear_bit(i
, vdev
->features
);
788 EXPORT_SYMBOL_GPL(vring_transport_features
);
791 * virtqueue_get_vring_size - return the size of the virtqueue's vring
792 * @vq: the struct virtqueue containing the vring of interest.
794 * Returns the size of the vring. This is mainly used for boasting to
795 * userspace. Unlike other operations, this need not be serialized.
797 unsigned int virtqueue_get_vring_size(struct virtqueue
*_vq
)
800 struct vring_virtqueue
*vq
= to_vvq(_vq
);
802 return vq
->vring
.num
;
804 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size
);
806 bool virtqueue_is_broken(struct virtqueue
*_vq
)
808 struct vring_virtqueue
*vq
= to_vvq(_vq
);
812 EXPORT_SYMBOL_GPL(virtqueue_is_broken
);
815 * This should prevent the device from being used, allowing drivers to
816 * recover. You may need to grab appropriate locks to flush.
818 void virtio_break_device(struct virtio_device
*dev
)
820 struct virtqueue
*_vq
;
822 list_for_each_entry(_vq
, &dev
->vqs
, list
) {
823 struct vring_virtqueue
*vq
= to_vvq(_vq
);
827 EXPORT_SYMBOL_GPL(virtio_break_device
);
829 MODULE_LICENSE("GPL");