/*
 * Back-end of the driver for virtual network devices. This portion of the
 * driver exports a 'unified' network-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/net/xen-netfront.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
37 #include <linux/kthread.h>
38 #include <linux/if_vlan.h>
39 #include <linux/udp.h>
40 #include <linux/highmem.h>
45 #include <xen/events.h>
46 #include <xen/interface/memory.h>
48 #include <asm/xen/hypercall.h>
49 #include <asm/xen/page.h>
51 /* Provide an option to disable split event channels at load time as
52 * event channels are limited resource. Split event channels are
55 bool separate_tx_rx_irq
= 1;
56 module_param(separate_tx_rx_irq
, bool, 0644);
58 /* When guest ring is filled up, qdisc queues the packets for us, but we have
59 * to timeout them, otherwise other guests' packets can get stuck there
61 unsigned int rx_drain_timeout_msecs
= 10000;
62 module_param(rx_drain_timeout_msecs
, uint
, 0444);
63 unsigned int rx_drain_timeout_jiffies
;
65 unsigned int xenvif_max_queues
;
66 module_param_named(max_queues
, xenvif_max_queues
, uint
, 0644);
67 MODULE_PARM_DESC(max_queues
,
68 "Maximum number of queues per virtual interface");
71 * This is the maximum slots a skb can have. If a guest sends a skb
72 * which exceeds this limit it is considered malicious.
74 #define FATAL_SKB_SLOTS_DEFAULT 20
75 static unsigned int fatal_skb_slots
= FATAL_SKB_SLOTS_DEFAULT
;
76 module_param(fatal_skb_slots
, uint
, 0444);
78 static void xenvif_idx_release(struct xenvif_queue
*queue
, u16 pending_idx
,
81 static void make_tx_response(struct xenvif_queue
*queue
,
82 struct xen_netif_tx_request
*txp
,
85 static inline int tx_work_todo(struct xenvif_queue
*queue
);
86 static inline int rx_work_todo(struct xenvif_queue
*queue
);
88 static struct xen_netif_rx_response
*make_rx_response(struct xenvif_queue
*queue
,
95 static inline unsigned long idx_to_pfn(struct xenvif_queue
*queue
,
98 return page_to_pfn(queue
->mmap_pages
[idx
]);
101 static inline unsigned long idx_to_kaddr(struct xenvif_queue
*queue
,
104 return (unsigned long)pfn_to_kaddr(idx_to_pfn(queue
, idx
));
/* Zerocopy callback bookkeeping embedded in a pending tx slot.  Every
 * visible call site passes a struct xenvif_queue *, so the parameter is
 * named accordingly (the old name "vif" was misleading); arguments are
 * parenthesized for macro hygiene.
 */
#define callback_param(queue, pending_idx) \
	((queue)->pending_tx_info[(pending_idx)].callback_struct)
110 /* Find the containing VIF's structure from a pointer in pending_tx_info array
112 static inline struct xenvif_queue
*ubuf_to_queue(const struct ubuf_info
*ubuf
)
114 u16 pending_idx
= ubuf
->desc
;
115 struct pending_tx_info
*temp
=
116 container_of(ubuf
, struct pending_tx_info
, callback_struct
);
117 return container_of(temp
- pending_idx
,
/* This is a minimum size for the linear area to avoid lots of
 * calls to __pskb_pull_tail() as we set up checksum offsets. The
 * value 128 was chosen as it covers all IPv4 and most likely
 * IPv6 headers.
 */
#define PKT_PROT_LEN 128
129 static u16
frag_get_pending_idx(skb_frag_t
*frag
)
131 return (u16
)frag
->page_offset
;
134 static void frag_set_pending_idx(skb_frag_t
*frag
, u16 pending_idx
)
136 frag
->page_offset
= pending_idx
;
139 static inline pending_ring_idx_t
pending_index(unsigned i
)
141 return i
& (MAX_PENDING_REQS
-1);
144 bool xenvif_rx_ring_slots_available(struct xenvif_queue
*queue
, int needed
)
149 prod
= queue
->rx
.sring
->req_prod
;
150 cons
= queue
->rx
.req_cons
;
152 if (prod
- cons
>= needed
)
155 queue
->rx
.sring
->req_event
= prod
+ 1;
157 /* Make sure event is visible before we check prod
161 } while (queue
->rx
.sring
->req_prod
!= prod
);
167 * Returns true if we should start a new receive buffer instead of
168 * adding 'size' bytes to a buffer which currently contains 'offset'
171 static bool start_new_rx_buffer(int offset
, unsigned long size
, int head
)
173 /* simple case: we have completely filled the current buffer. */
174 if (offset
== MAX_BUFFER_OFFSET
)
178 * complex case: start a fresh buffer if the current frag
179 * would overflow the current buffer but only if:
180 * (i) this frag would fit completely in the next buffer
181 * and (ii) there is already some data in the current buffer
182 * and (iii) this is not the head buffer.
185 * - (i) stops us splitting a frag into two copies
186 * unless the frag is too large for a single buffer.
187 * - (ii) stops us from leaving a buffer pointlessly empty.
188 * - (iii) stops us leaving the first buffer
189 * empty. Strictly speaking this is already covered
190 * by (ii) but is explicitly checked because
191 * netfront relies on the first buffer being
192 * non-empty and can crash otherwise.
194 * This means we will effectively linearise small
195 * frags but do not needlessly split large buffers
196 * into multiple copies tend to give large frags their
197 * own buffers as before.
199 BUG_ON(size
> MAX_BUFFER_OFFSET
);
200 if ((offset
+ size
> MAX_BUFFER_OFFSET
) && offset
&& !head
)
206 struct netrx_pending_operations
{
207 unsigned copy_prod
, copy_cons
;
208 unsigned meta_prod
, meta_cons
;
209 struct gnttab_copy
*copy
;
210 struct xenvif_rx_meta
*meta
;
212 grant_ref_t copy_gref
;
215 static struct xenvif_rx_meta
*get_next_rx_buffer(struct xenvif_queue
*queue
,
216 struct netrx_pending_operations
*npo
)
218 struct xenvif_rx_meta
*meta
;
219 struct xen_netif_rx_request
*req
;
221 req
= RING_GET_REQUEST(&queue
->rx
, queue
->rx
.req_cons
++);
223 meta
= npo
->meta
+ npo
->meta_prod
++;
224 meta
->gso_type
= XEN_NETIF_GSO_TYPE_NONE
;
230 npo
->copy_gref
= req
->gref
;
236 * Set up the grant operations for this fragment. If it's a flipping
237 * interface, we also set up the unmap request from here.
239 static void xenvif_gop_frag_copy(struct xenvif_queue
*queue
, struct sk_buff
*skb
,
240 struct netrx_pending_operations
*npo
,
241 struct page
*page
, unsigned long size
,
242 unsigned long offset
, int *head
,
243 struct xenvif_queue
*foreign_queue
,
244 grant_ref_t foreign_gref
)
246 struct gnttab_copy
*copy_gop
;
247 struct xenvif_rx_meta
*meta
;
249 int gso_type
= XEN_NETIF_GSO_TYPE_NONE
;
251 /* Data must not cross a page boundary. */
252 BUG_ON(size
+ offset
> PAGE_SIZE
<<compound_order(page
));
254 meta
= npo
->meta
+ npo
->meta_prod
- 1;
256 /* Skip unused frames from start of page */
257 page
+= offset
>> PAGE_SHIFT
;
258 offset
&= ~PAGE_MASK
;
261 BUG_ON(offset
>= PAGE_SIZE
);
262 BUG_ON(npo
->copy_off
> MAX_BUFFER_OFFSET
);
264 bytes
= PAGE_SIZE
- offset
;
269 if (start_new_rx_buffer(npo
->copy_off
, bytes
, *head
)) {
271 * Netfront requires there to be some data in the head
276 meta
= get_next_rx_buffer(queue
, npo
);
279 if (npo
->copy_off
+ bytes
> MAX_BUFFER_OFFSET
)
280 bytes
= MAX_BUFFER_OFFSET
- npo
->copy_off
;
282 copy_gop
= npo
->copy
+ npo
->copy_prod
++;
283 copy_gop
->flags
= GNTCOPY_dest_gref
;
284 copy_gop
->len
= bytes
;
287 copy_gop
->source
.domid
= foreign_queue
->vif
->domid
;
288 copy_gop
->source
.u
.ref
= foreign_gref
;
289 copy_gop
->flags
|= GNTCOPY_source_gref
;
291 copy_gop
->source
.domid
= DOMID_SELF
;
292 copy_gop
->source
.u
.gmfn
=
293 virt_to_mfn(page_address(page
));
295 copy_gop
->source
.offset
= offset
;
297 copy_gop
->dest
.domid
= queue
->vif
->domid
;
298 copy_gop
->dest
.offset
= npo
->copy_off
;
299 copy_gop
->dest
.u
.ref
= npo
->copy_gref
;
301 npo
->copy_off
+= bytes
;
308 if (offset
== PAGE_SIZE
&& size
) {
309 BUG_ON(!PageCompound(page
));
314 /* Leave a gap for the GSO descriptor. */
315 if (skb_is_gso(skb
)) {
316 if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV4
)
317 gso_type
= XEN_NETIF_GSO_TYPE_TCPV4
;
318 else if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV6
)
319 gso_type
= XEN_NETIF_GSO_TYPE_TCPV6
;
322 if (*head
&& ((1 << gso_type
) & queue
->vif
->gso_mask
))
323 queue
->rx
.req_cons
++;
325 *head
= 0; /* There must be something in this buffer now. */
331 * Find the grant ref for a given frag in a chain of struct ubuf_info's
332 * skb: the skb itself
333 * i: the frag's number
334 * ubuf: a pointer to an element in the chain. It should not be NULL
336 * Returns a pointer to the element in the chain where the page were found. If
337 * not found, returns NULL.
338 * See the definition of callback_struct in common.h for more details about
341 static const struct ubuf_info
*xenvif_find_gref(const struct sk_buff
*const skb
,
343 const struct ubuf_info
*ubuf
)
345 struct xenvif_queue
*foreign_queue
= ubuf_to_queue(ubuf
);
348 u16 pending_idx
= ubuf
->desc
;
350 if (skb_shinfo(skb
)->frags
[i
].page
.p
==
351 foreign_queue
->mmap_pages
[pending_idx
])
353 ubuf
= (struct ubuf_info
*) ubuf
->ctx
;
360 * Prepare an SKB to be transmitted to the frontend.
362 * This function is responsible for allocating grant operations, meta
365 * It returns the number of meta structures consumed. The number of
366 * ring slots used is always equal to the number of meta slots used
367 * plus the number of GSO descriptors used. Currently, we use either
368 * zero GSO descriptors (for non-GSO packets) or one descriptor (for
369 * frontend-side LRO).
371 static int xenvif_gop_skb(struct sk_buff
*skb
,
372 struct netrx_pending_operations
*npo
,
373 struct xenvif_queue
*queue
)
375 struct xenvif
*vif
= netdev_priv(skb
->dev
);
376 int nr_frags
= skb_shinfo(skb
)->nr_frags
;
378 struct xen_netif_rx_request
*req
;
379 struct xenvif_rx_meta
*meta
;
384 const struct ubuf_info
*ubuf
= skb_shinfo(skb
)->destructor_arg
;
385 const struct ubuf_info
*const head_ubuf
= ubuf
;
387 old_meta_prod
= npo
->meta_prod
;
389 gso_type
= XEN_NETIF_GSO_TYPE_NONE
;
390 if (skb_is_gso(skb
)) {
391 if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV4
)
392 gso_type
= XEN_NETIF_GSO_TYPE_TCPV4
;
393 else if (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV6
)
394 gso_type
= XEN_NETIF_GSO_TYPE_TCPV6
;
397 /* Set up a GSO prefix descriptor, if necessary */
398 if ((1 << gso_type
) & vif
->gso_prefix_mask
) {
399 req
= RING_GET_REQUEST(&queue
->rx
, queue
->rx
.req_cons
++);
400 meta
= npo
->meta
+ npo
->meta_prod
++;
401 meta
->gso_type
= gso_type
;
402 meta
->gso_size
= skb_shinfo(skb
)->gso_size
;
407 req
= RING_GET_REQUEST(&queue
->rx
, queue
->rx
.req_cons
++);
408 meta
= npo
->meta
+ npo
->meta_prod
++;
410 if ((1 << gso_type
) & vif
->gso_mask
) {
411 meta
->gso_type
= gso_type
;
412 meta
->gso_size
= skb_shinfo(skb
)->gso_size
;
414 meta
->gso_type
= XEN_NETIF_GSO_TYPE_NONE
;
421 npo
->copy_gref
= req
->gref
;
424 while (data
< skb_tail_pointer(skb
)) {
425 unsigned int offset
= offset_in_page(data
);
426 unsigned int len
= PAGE_SIZE
- offset
;
428 if (data
+ len
> skb_tail_pointer(skb
))
429 len
= skb_tail_pointer(skb
) - data
;
431 xenvif_gop_frag_copy(queue
, skb
, npo
,
432 virt_to_page(data
), len
, offset
, &head
,
438 for (i
= 0; i
< nr_frags
; i
++) {
439 /* This variable also signals whether foreign_gref has a real
442 struct xenvif_queue
*foreign_queue
= NULL
;
443 grant_ref_t foreign_gref
;
445 if ((skb_shinfo(skb
)->tx_flags
& SKBTX_DEV_ZEROCOPY
) &&
446 (ubuf
->callback
== &xenvif_zerocopy_callback
)) {
447 const struct ubuf_info
*const startpoint
= ubuf
;
449 /* Ideally ubuf points to the chain element which
450 * belongs to this frag. Or if frags were removed from
451 * the beginning, then shortly before it.
453 ubuf
= xenvif_find_gref(skb
, i
, ubuf
);
455 /* Try again from the beginning of the list, if we
456 * haven't tried from there. This only makes sense in
457 * the unlikely event of reordering the original frags.
458 * For injected local pages it's an unnecessary second
461 if (unlikely(!ubuf
) && startpoint
!= head_ubuf
)
462 ubuf
= xenvif_find_gref(skb
, i
, head_ubuf
);
465 u16 pending_idx
= ubuf
->desc
;
467 foreign_queue
= ubuf_to_queue(ubuf
);
469 foreign_queue
->pending_tx_info
[pending_idx
].req
.gref
;
470 /* Just a safety measure. If this was the last
471 * element on the list, the for loop will
472 * iterate again if a local page were added to
473 * the end. Using head_ubuf here prevents the
474 * second search on the chain. Or the original
475 * frags changed order, but that's less likely.
476 * In any way, ubuf shouldn't be NULL.
479 (struct ubuf_info
*) ubuf
->ctx
:
482 /* This frag was a local page, added to the
483 * array after the skb left netback.
487 xenvif_gop_frag_copy(queue
, skb
, npo
,
488 skb_frag_page(&skb_shinfo(skb
)->frags
[i
]),
489 skb_frag_size(&skb_shinfo(skb
)->frags
[i
]),
490 skb_shinfo(skb
)->frags
[i
].page_offset
,
493 foreign_queue
? foreign_gref
: UINT_MAX
);
496 return npo
->meta_prod
- old_meta_prod
;
500 * This is a twin to xenvif_gop_skb. Assume that xenvif_gop_skb was
501 * used to set up the operations on the top of
502 * netrx_pending_operations, which have since been done. Check that
503 * they didn't give any errors and advance over them.
505 static int xenvif_check_gop(struct xenvif
*vif
, int nr_meta_slots
,
506 struct netrx_pending_operations
*npo
)
508 struct gnttab_copy
*copy_op
;
509 int status
= XEN_NETIF_RSP_OKAY
;
512 for (i
= 0; i
< nr_meta_slots
; i
++) {
513 copy_op
= npo
->copy
+ npo
->copy_cons
++;
514 if (copy_op
->status
!= GNTST_okay
) {
516 "Bad status %d from copy to DOM%d.\n",
517 copy_op
->status
, vif
->domid
);
518 status
= XEN_NETIF_RSP_ERROR
;
525 static void xenvif_add_frag_responses(struct xenvif_queue
*queue
, int status
,
526 struct xenvif_rx_meta
*meta
,
530 unsigned long offset
;
532 /* No fragments used */
533 if (nr_meta_slots
<= 1)
538 for (i
= 0; i
< nr_meta_slots
; i
++) {
540 if (i
== nr_meta_slots
- 1)
543 flags
= XEN_NETRXF_more_data
;
546 make_rx_response(queue
, meta
[i
].id
, status
, offset
,
547 meta
[i
].size
, flags
);
551 struct xenvif_rx_cb
{
555 #define XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb)
557 void xenvif_kick_thread(struct xenvif_queue
*queue
)
562 static void xenvif_rx_action(struct xenvif_queue
*queue
)
566 struct xen_netif_rx_response
*resp
;
567 struct sk_buff_head rxq
;
571 unsigned long offset
;
572 bool need_to_notify
= false;
574 struct netrx_pending_operations npo
= {
575 .copy
= queue
->grant_copy_op
,
579 skb_queue_head_init(&rxq
);
581 while ((skb
= skb_dequeue(&queue
->rx_queue
)) != NULL
) {
582 RING_IDX max_slots_needed
;
583 RING_IDX old_req_cons
;
584 RING_IDX ring_slots_used
;
587 /* We need a cheap worse case estimate for the number of
591 max_slots_needed
= DIV_ROUND_UP(offset_in_page(skb
->data
) +
594 for (i
= 0; i
< skb_shinfo(skb
)->nr_frags
; i
++) {
598 size
= skb_frag_size(&skb_shinfo(skb
)->frags
[i
]);
599 offset
= skb_shinfo(skb
)->frags
[i
].page_offset
;
601 /* For a worse-case estimate we need to factor in
602 * the fragment page offset as this will affect the
603 * number of times xenvif_gop_frag_copy() will
604 * call start_new_rx_buffer().
606 max_slots_needed
+= DIV_ROUND_UP(offset
+ size
,
610 /* To avoid the estimate becoming too pessimal for some
611 * frontends that limit posted rx requests, cap the estimate
614 if (max_slots_needed
> MAX_SKB_FRAGS
)
615 max_slots_needed
= MAX_SKB_FRAGS
;
617 /* We may need one more slot for GSO metadata */
618 if (skb_is_gso(skb
) &&
619 (skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV4
||
620 skb_shinfo(skb
)->gso_type
& SKB_GSO_TCPV6
))
623 /* If the skb may not fit then bail out now */
624 if (!xenvif_rx_ring_slots_available(queue
, max_slots_needed
)) {
625 skb_queue_head(&queue
->rx_queue
, skb
);
626 need_to_notify
= true;
627 queue
->rx_last_skb_slots
= max_slots_needed
;
630 queue
->rx_last_skb_slots
= 0;
632 old_req_cons
= queue
->rx
.req_cons
;
633 XENVIF_RX_CB(skb
)->meta_slots_used
= xenvif_gop_skb(skb
, &npo
, queue
);
634 ring_slots_used
= queue
->rx
.req_cons
- old_req_cons
;
636 BUG_ON(ring_slots_used
> max_slots_needed
);
638 __skb_queue_tail(&rxq
, skb
);
641 BUG_ON(npo
.meta_prod
> ARRAY_SIZE(queue
->meta
));
646 BUG_ON(npo
.copy_prod
> MAX_GRANT_COPY_OPS
);
647 gnttab_batch_copy(queue
->grant_copy_op
, npo
.copy_prod
);
649 while ((skb
= __skb_dequeue(&rxq
)) != NULL
) {
651 if ((1 << queue
->meta
[npo
.meta_cons
].gso_type
) &
652 queue
->vif
->gso_prefix_mask
) {
653 resp
= RING_GET_RESPONSE(&queue
->rx
,
654 queue
->rx
.rsp_prod_pvt
++);
656 resp
->flags
= XEN_NETRXF_gso_prefix
| XEN_NETRXF_more_data
;
658 resp
->offset
= queue
->meta
[npo
.meta_cons
].gso_size
;
659 resp
->id
= queue
->meta
[npo
.meta_cons
].id
;
660 resp
->status
= XENVIF_RX_CB(skb
)->meta_slots_used
;
663 XENVIF_RX_CB(skb
)->meta_slots_used
--;
667 queue
->stats
.tx_bytes
+= skb
->len
;
668 queue
->stats
.tx_packets
++;
670 status
= xenvif_check_gop(queue
->vif
,
671 XENVIF_RX_CB(skb
)->meta_slots_used
,
674 if (XENVIF_RX_CB(skb
)->meta_slots_used
== 1)
677 flags
= XEN_NETRXF_more_data
;
679 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) /* local packet? */
680 flags
|= XEN_NETRXF_csum_blank
| XEN_NETRXF_data_validated
;
681 else if (skb
->ip_summed
== CHECKSUM_UNNECESSARY
)
682 /* remote but checksummed. */
683 flags
|= XEN_NETRXF_data_validated
;
686 resp
= make_rx_response(queue
, queue
->meta
[npo
.meta_cons
].id
,
688 queue
->meta
[npo
.meta_cons
].size
,
691 if ((1 << queue
->meta
[npo
.meta_cons
].gso_type
) &
692 queue
->vif
->gso_mask
) {
693 struct xen_netif_extra_info
*gso
=
694 (struct xen_netif_extra_info
*)
695 RING_GET_RESPONSE(&queue
->rx
,
696 queue
->rx
.rsp_prod_pvt
++);
698 resp
->flags
|= XEN_NETRXF_extra_info
;
700 gso
->u
.gso
.type
= queue
->meta
[npo
.meta_cons
].gso_type
;
701 gso
->u
.gso
.size
= queue
->meta
[npo
.meta_cons
].gso_size
;
703 gso
->u
.gso
.features
= 0;
705 gso
->type
= XEN_NETIF_EXTRA_TYPE_GSO
;
709 xenvif_add_frag_responses(queue
, status
,
710 queue
->meta
+ npo
.meta_cons
+ 1,
711 XENVIF_RX_CB(skb
)->meta_slots_used
);
713 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue
->rx
, ret
);
715 need_to_notify
|= !!ret
;
717 npo
.meta_cons
+= XENVIF_RX_CB(skb
)->meta_slots_used
;
723 notify_remote_via_irq(queue
->rx_irq
);
726 void xenvif_napi_schedule_or_enable_events(struct xenvif_queue
*queue
)
730 RING_FINAL_CHECK_FOR_REQUESTS(&queue
->tx
, more_to_do
);
733 napi_schedule(&queue
->napi
);
736 static void tx_add_credit(struct xenvif_queue
*queue
)
738 unsigned long max_burst
, max_credit
;
741 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
742 * Otherwise the interface can seize up due to insufficient credit.
744 max_burst
= RING_GET_REQUEST(&queue
->tx
, queue
->tx
.req_cons
)->size
;
745 max_burst
= min(max_burst
, 131072UL);
746 max_burst
= max(max_burst
, queue
->credit_bytes
);
748 /* Take care that adding a new chunk of credit doesn't wrap to zero. */
749 max_credit
= queue
->remaining_credit
+ queue
->credit_bytes
;
750 if (max_credit
< queue
->remaining_credit
)
751 max_credit
= ULONG_MAX
; /* wrapped: clamp to ULONG_MAX */
753 queue
->remaining_credit
= min(max_credit
, max_burst
);
/* Credit timer callback: top up transmit credit and kick tx processing
 * again (data is the struct xenvif_queue * stashed in the timer).
 */
static void tx_credit_callback(unsigned long data)
{
	struct xenvif_queue *queue = (struct xenvif_queue *)data;

	tx_add_credit(queue);
	xenvif_napi_schedule_or_enable_events(queue);
}
763 static void xenvif_tx_err(struct xenvif_queue
*queue
,
764 struct xen_netif_tx_request
*txp
, RING_IDX end
)
766 RING_IDX cons
= queue
->tx
.req_cons
;
770 spin_lock_irqsave(&queue
->response_lock
, flags
);
771 make_tx_response(queue
, txp
, XEN_NETIF_RSP_ERROR
);
772 spin_unlock_irqrestore(&queue
->response_lock
, flags
);
775 txp
= RING_GET_REQUEST(&queue
->tx
, cons
++);
777 queue
->tx
.req_cons
= cons
;
780 static void xenvif_fatal_tx_err(struct xenvif
*vif
)
782 netdev_err(vif
->dev
, "fatal error; disabling device\n");
783 vif
->disabled
= true;
784 /* Disable the vif from queue 0's kthread */
786 xenvif_kick_thread(&vif
->queues
[0]);
789 static int xenvif_count_requests(struct xenvif_queue
*queue
,
790 struct xen_netif_tx_request
*first
,
791 struct xen_netif_tx_request
*txp
,
794 RING_IDX cons
= queue
->tx
.req_cons
;
799 if (!(first
->flags
& XEN_NETTXF_more_data
))
803 struct xen_netif_tx_request dropped_tx
= { 0 };
805 if (slots
>= work_to_do
) {
806 netdev_err(queue
->vif
->dev
,
807 "Asked for %d slots but exceeds this limit\n",
809 xenvif_fatal_tx_err(queue
->vif
);
813 /* This guest is really using too many slots and
814 * considered malicious.
816 if (unlikely(slots
>= fatal_skb_slots
)) {
817 netdev_err(queue
->vif
->dev
,
818 "Malicious frontend using %d slots, threshold %u\n",
819 slots
, fatal_skb_slots
);
820 xenvif_fatal_tx_err(queue
->vif
);
824 /* Xen network protocol had implicit dependency on
825 * MAX_SKB_FRAGS. XEN_NETBK_LEGACY_SLOTS_MAX is set to
826 * the historical MAX_SKB_FRAGS value 18 to honor the
827 * same behavior as before. Any packet using more than
828 * 18 slots but less than fatal_skb_slots slots is
831 if (!drop_err
&& slots
>= XEN_NETBK_LEGACY_SLOTS_MAX
) {
833 netdev_dbg(queue
->vif
->dev
,
834 "Too many slots (%d) exceeding limit (%d), dropping packet\n",
835 slots
, XEN_NETBK_LEGACY_SLOTS_MAX
);
842 memcpy(txp
, RING_GET_REQUEST(&queue
->tx
, cons
+ slots
),
845 /* If the guest submitted a frame >= 64 KiB then
846 * first->size overflowed and following slots will
847 * appear to be larger than the frame.
849 * This cannot be fatal error as there are buggy
850 * frontends that do this.
852 * Consume all slots and drop the packet.
854 if (!drop_err
&& txp
->size
> first
->size
) {
856 netdev_dbg(queue
->vif
->dev
,
857 "Invalid tx request, slot size %u > remaining size %u\n",
858 txp
->size
, first
->size
);
862 first
->size
-= txp
->size
;
865 if (unlikely((txp
->offset
+ txp
->size
) > PAGE_SIZE
)) {
866 netdev_err(queue
->vif
->dev
, "Cross page boundary, txp->offset: %x, size: %u\n",
867 txp
->offset
, txp
->size
);
868 xenvif_fatal_tx_err(queue
->vif
);
872 more_data
= txp
->flags
& XEN_NETTXF_more_data
;
880 xenvif_tx_err(queue
, first
, cons
+ slots
);
888 struct xenvif_tx_cb
{
892 #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
894 static inline void xenvif_tx_create_map_op(struct xenvif_queue
*queue
,
896 struct xen_netif_tx_request
*txp
,
897 struct gnttab_map_grant_ref
*mop
)
899 queue
->pages_to_map
[mop
-queue
->tx_map_ops
] = queue
->mmap_pages
[pending_idx
];
900 gnttab_set_map_op(mop
, idx_to_kaddr(queue
, pending_idx
),
901 GNTMAP_host_map
| GNTMAP_readonly
,
902 txp
->gref
, queue
->vif
->domid
);
904 memcpy(&queue
->pending_tx_info
[pending_idx
].req
, txp
,
908 static inline struct sk_buff
*xenvif_alloc_skb(unsigned int size
)
910 struct sk_buff
*skb
=
911 alloc_skb(size
+ NET_SKB_PAD
+ NET_IP_ALIGN
,
912 GFP_ATOMIC
| __GFP_NOWARN
);
913 if (unlikely(skb
== NULL
))
916 /* Packets passed to netif_rx() must have some headroom. */
917 skb_reserve(skb
, NET_SKB_PAD
+ NET_IP_ALIGN
);
919 /* Initialize it here to avoid later surprises */
920 skb_shinfo(skb
)->destructor_arg
= NULL
;
925 static struct gnttab_map_grant_ref
*xenvif_get_requests(struct xenvif_queue
*queue
,
927 struct xen_netif_tx_request
*txp
,
928 struct gnttab_map_grant_ref
*gop
)
930 struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
931 skb_frag_t
*frags
= shinfo
->frags
;
932 u16 pending_idx
= XENVIF_TX_CB(skb
)->pending_idx
;
934 pending_ring_idx_t index
;
935 unsigned int nr_slots
, frag_overflow
= 0;
937 /* At this point shinfo->nr_frags is in fact the number of
938 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
940 if (shinfo
->nr_frags
> MAX_SKB_FRAGS
) {
941 frag_overflow
= shinfo
->nr_frags
- MAX_SKB_FRAGS
;
942 BUG_ON(frag_overflow
> MAX_SKB_FRAGS
);
943 shinfo
->nr_frags
= MAX_SKB_FRAGS
;
945 nr_slots
= shinfo
->nr_frags
;
947 /* Skip first skb fragment if it is on same page as header fragment. */
948 start
= (frag_get_pending_idx(&shinfo
->frags
[0]) == pending_idx
);
950 for (shinfo
->nr_frags
= start
; shinfo
->nr_frags
< nr_slots
;
951 shinfo
->nr_frags
++, txp
++, gop
++) {
952 index
= pending_index(queue
->pending_cons
++);
953 pending_idx
= queue
->pending_ring
[index
];
954 xenvif_tx_create_map_op(queue
, pending_idx
, txp
, gop
);
955 frag_set_pending_idx(&frags
[shinfo
->nr_frags
], pending_idx
);
959 struct sk_buff
*nskb
= xenvif_alloc_skb(0);
960 if (unlikely(nskb
== NULL
)) {
962 netdev_err(queue
->vif
->dev
,
963 "Can't allocate the frag_list skb.\n");
967 shinfo
= skb_shinfo(nskb
);
968 frags
= shinfo
->frags
;
970 for (shinfo
->nr_frags
= 0; shinfo
->nr_frags
< frag_overflow
;
971 shinfo
->nr_frags
++, txp
++, gop
++) {
972 index
= pending_index(queue
->pending_cons
++);
973 pending_idx
= queue
->pending_ring
[index
];
974 xenvif_tx_create_map_op(queue
, pending_idx
, txp
, gop
);
975 frag_set_pending_idx(&frags
[shinfo
->nr_frags
],
979 skb_shinfo(skb
)->frag_list
= nskb
;
985 static inline void xenvif_grant_handle_set(struct xenvif_queue
*queue
,
987 grant_handle_t handle
)
989 if (unlikely(queue
->grant_tx_handle
[pending_idx
] !=
990 NETBACK_INVALID_HANDLE
)) {
991 netdev_err(queue
->vif
->dev
,
992 "Trying to overwrite active handle! pending_idx: %x\n",
996 queue
->grant_tx_handle
[pending_idx
] = handle
;
999 static inline void xenvif_grant_handle_reset(struct xenvif_queue
*queue
,
1002 if (unlikely(queue
->grant_tx_handle
[pending_idx
] ==
1003 NETBACK_INVALID_HANDLE
)) {
1004 netdev_err(queue
->vif
->dev
,
1005 "Trying to unmap invalid handle! pending_idx: %x\n",
1009 queue
->grant_tx_handle
[pending_idx
] = NETBACK_INVALID_HANDLE
;
1012 static int xenvif_tx_check_gop(struct xenvif_queue
*queue
,
1013 struct sk_buff
*skb
,
1014 struct gnttab_map_grant_ref
**gopp_map
,
1015 struct gnttab_copy
**gopp_copy
)
1017 struct gnttab_map_grant_ref
*gop_map
= *gopp_map
;
1018 u16 pending_idx
= XENVIF_TX_CB(skb
)->pending_idx
;
1019 struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
1020 int nr_frags
= shinfo
->nr_frags
;
1022 struct sk_buff
*first_skb
= NULL
;
1024 /* Check status of header. */
1025 err
= (*gopp_copy
)->status
;
1027 if (unlikely(err
)) {
1028 if (net_ratelimit())
1029 netdev_dbg(queue
->vif
->dev
,
1030 "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
1031 (*gopp_copy
)->status
,
1033 (*gopp_copy
)->source
.u
.ref
);
1034 xenvif_idx_release(queue
, pending_idx
, XEN_NETIF_RSP_ERROR
);
1038 for (i
= 0; i
< nr_frags
; i
++, gop_map
++) {
1041 pending_idx
= frag_get_pending_idx(&shinfo
->frags
[i
]);
1043 /* Check error status: if okay then remember grant handle. */
1044 newerr
= gop_map
->status
;
1046 if (likely(!newerr
)) {
1047 xenvif_grant_handle_set(queue
,
1050 /* Had a previous error? Invalidate this fragment. */
1052 xenvif_idx_unmap(queue
, pending_idx
);
1056 /* Error on this fragment: respond to client with an error. */
1057 if (net_ratelimit())
1058 netdev_dbg(queue
->vif
->dev
,
1059 "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
1064 xenvif_idx_release(queue
, pending_idx
, XEN_NETIF_RSP_ERROR
);
1066 /* Not the first error? Preceding frags already invalidated. */
1069 /* First error: invalidate preceding fragments. */
1070 for (j
= 0; j
< i
; j
++) {
1071 pending_idx
= frag_get_pending_idx(&shinfo
->frags
[j
]);
1072 xenvif_idx_unmap(queue
, pending_idx
);
1075 /* Remember the error: invalidate all subsequent fragments. */
1079 if (skb_has_frag_list(skb
)) {
1081 skb
= shinfo
->frag_list
;
1082 shinfo
= skb_shinfo(skb
);
1083 nr_frags
= shinfo
->nr_frags
;
1088 /* There was a mapping error in the frag_list skb. We have to unmap
1089 * the first skb's frags
1091 if (first_skb
&& err
) {
1093 shinfo
= skb_shinfo(first_skb
);
1094 for (j
= 0; j
< shinfo
->nr_frags
; j
++) {
1095 pending_idx
= frag_get_pending_idx(&shinfo
->frags
[j
]);
1096 xenvif_idx_unmap(queue
, pending_idx
);
1100 *gopp_map
= gop_map
;
1104 static void xenvif_fill_frags(struct xenvif_queue
*queue
, struct sk_buff
*skb
)
1106 struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
1107 int nr_frags
= shinfo
->nr_frags
;
1109 u16 prev_pending_idx
= INVALID_PENDING_IDX
;
1111 for (i
= 0; i
< nr_frags
; i
++) {
1112 skb_frag_t
*frag
= shinfo
->frags
+ i
;
1113 struct xen_netif_tx_request
*txp
;
1117 pending_idx
= frag_get_pending_idx(frag
);
1119 /* If this is not the first frag, chain it to the previous*/
1120 if (prev_pending_idx
== INVALID_PENDING_IDX
)
1121 skb_shinfo(skb
)->destructor_arg
=
1122 &callback_param(queue
, pending_idx
);
1124 callback_param(queue
, prev_pending_idx
).ctx
=
1125 &callback_param(queue
, pending_idx
);
1127 callback_param(queue
, pending_idx
).ctx
= NULL
;
1128 prev_pending_idx
= pending_idx
;
1130 txp
= &queue
->pending_tx_info
[pending_idx
].req
;
1131 page
= virt_to_page(idx_to_kaddr(queue
, pending_idx
));
1132 __skb_fill_page_desc(skb
, i
, page
, txp
->offset
, txp
->size
);
1133 skb
->len
+= txp
->size
;
1134 skb
->data_len
+= txp
->size
;
1135 skb
->truesize
+= txp
->size
;
1137 /* Take an extra reference to offset network stack's put_page */
1138 get_page(queue
->mmap_pages
[pending_idx
]);
1140 /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
1141 * overlaps with "index", and "mapping" is not set. I think mapping
1142 * should be set. If delivered to local stack, it would drop this
1143 * skb in sk_filter unless the socket has the right to use it.
1145 skb
->pfmemalloc
= false;
1148 static int xenvif_get_extras(struct xenvif_queue
*queue
,
1149 struct xen_netif_extra_info
*extras
,
1152 struct xen_netif_extra_info extra
;
1153 RING_IDX cons
= queue
->tx
.req_cons
;
1156 if (unlikely(work_to_do
-- <= 0)) {
1157 netdev_err(queue
->vif
->dev
, "Missing extra info\n");
1158 xenvif_fatal_tx_err(queue
->vif
);
1162 memcpy(&extra
, RING_GET_REQUEST(&queue
->tx
, cons
),
1164 if (unlikely(!extra
.type
||
1165 extra
.type
>= XEN_NETIF_EXTRA_TYPE_MAX
)) {
1166 queue
->tx
.req_cons
= ++cons
;
1167 netdev_err(queue
->vif
->dev
,
1168 "Invalid extra type: %d\n", extra
.type
);
1169 xenvif_fatal_tx_err(queue
->vif
);
1173 memcpy(&extras
[extra
.type
- 1], &extra
, sizeof(extra
));
1174 queue
->tx
.req_cons
= ++cons
;
1175 } while (extra
.flags
& XEN_NETIF_EXTRA_FLAG_MORE
);
1180 static int xenvif_set_skb_gso(struct xenvif
*vif
,
1181 struct sk_buff
*skb
,
1182 struct xen_netif_extra_info
*gso
)
1184 if (!gso
->u
.gso
.size
) {
1185 netdev_err(vif
->dev
, "GSO size must not be zero.\n");
1186 xenvif_fatal_tx_err(vif
);
1190 switch (gso
->u
.gso
.type
) {
1191 case XEN_NETIF_GSO_TYPE_TCPV4
:
1192 skb_shinfo(skb
)->gso_type
= SKB_GSO_TCPV4
;
1194 case XEN_NETIF_GSO_TYPE_TCPV6
:
1195 skb_shinfo(skb
)->gso_type
= SKB_GSO_TCPV6
;
1198 netdev_err(vif
->dev
, "Bad GSO type %d.\n", gso
->u
.gso
.type
);
1199 xenvif_fatal_tx_err(vif
);
1203 skb_shinfo(skb
)->gso_size
= gso
->u
.gso
.size
;
1204 /* gso_segs will be calculated later */
1209 static int checksum_setup(struct xenvif_queue
*queue
, struct sk_buff
*skb
)
1211 bool recalculate_partial_csum
= false;
1213 /* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
1214 * peers can fail to set NETRXF_csum_blank when sending a GSO
1215 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
1216 * recalculate the partial checksum.
1218 if (skb
->ip_summed
!= CHECKSUM_PARTIAL
&& skb_is_gso(skb
)) {
1219 queue
->stats
.rx_gso_checksum_fixup
++;
1220 skb
->ip_summed
= CHECKSUM_PARTIAL
;
1221 recalculate_partial_csum
= true;
1224 /* A non-CHECKSUM_PARTIAL SKB does not require setup. */
1225 if (skb
->ip_summed
!= CHECKSUM_PARTIAL
)
1228 return skb_checksum_setup(skb
, recalculate_partial_csum
);
1231 static bool tx_credit_exceeded(struct xenvif_queue
*queue
, unsigned size
)
1233 u64 now
= get_jiffies_64();
1234 u64 next_credit
= queue
->credit_window_start
+
1235 msecs_to_jiffies(queue
->credit_usec
/ 1000);
1237 /* Timer could already be pending in rare cases. */
1238 if (timer_pending(&queue
->credit_timeout
))
1241 /* Passed the point where we can replenish credit? */
1242 if (time_after_eq64(now
, next_credit
)) {
1243 queue
->credit_window_start
= now
;
1244 tx_add_credit(queue
);
1247 /* Still too big to send right now? Set a callback. */
1248 if (size
> queue
->remaining_credit
) {
1249 queue
->credit_timeout
.data
=
1250 (unsigned long)queue
;
1251 queue
->credit_timeout
.function
=
1253 mod_timer(&queue
->credit_timeout
,
1255 queue
->credit_window_start
= next_credit
;
1263 static void xenvif_tx_build_gops(struct xenvif_queue
*queue
,
1268 struct gnttab_map_grant_ref
*gop
= queue
->tx_map_ops
, *request_gop
;
1269 struct sk_buff
*skb
;
1272 while (skb_queue_len(&queue
->tx_queue
) < budget
) {
1273 struct xen_netif_tx_request txreq
;
1274 struct xen_netif_tx_request txfrags
[XEN_NETBK_LEGACY_SLOTS_MAX
];
1275 struct xen_netif_extra_info extras
[XEN_NETIF_EXTRA_TYPE_MAX
-1];
1279 unsigned int data_len
;
1280 pending_ring_idx_t index
;
1282 if (queue
->tx
.sring
->req_prod
- queue
->tx
.req_cons
>
1283 XEN_NETIF_TX_RING_SIZE
) {
1284 netdev_err(queue
->vif
->dev
,
1285 "Impossible number of requests. "
1286 "req_prod %d, req_cons %d, size %ld\n",
1287 queue
->tx
.sring
->req_prod
, queue
->tx
.req_cons
,
1288 XEN_NETIF_TX_RING_SIZE
);
1289 xenvif_fatal_tx_err(queue
->vif
);
1293 work_to_do
= RING_HAS_UNCONSUMED_REQUESTS(&queue
->tx
);
1297 idx
= queue
->tx
.req_cons
;
1298 rmb(); /* Ensure that we see the request before we copy it. */
1299 memcpy(&txreq
, RING_GET_REQUEST(&queue
->tx
, idx
), sizeof(txreq
));
1301 /* Credit-based scheduling. */
1302 if (txreq
.size
> queue
->remaining_credit
&&
1303 tx_credit_exceeded(queue
, txreq
.size
))
1306 queue
->remaining_credit
-= txreq
.size
;
1309 queue
->tx
.req_cons
= ++idx
;
1311 memset(extras
, 0, sizeof(extras
));
1312 if (txreq
.flags
& XEN_NETTXF_extra_info
) {
1313 work_to_do
= xenvif_get_extras(queue
, extras
,
1315 idx
= queue
->tx
.req_cons
;
1316 if (unlikely(work_to_do
< 0))
1320 ret
= xenvif_count_requests(queue
, &txreq
, txfrags
, work_to_do
);
1321 if (unlikely(ret
< 0))
1326 if (unlikely(txreq
.size
< ETH_HLEN
)) {
1327 netdev_dbg(queue
->vif
->dev
,
1328 "Bad packet size: %d\n", txreq
.size
);
1329 xenvif_tx_err(queue
, &txreq
, idx
);
1333 /* No crossing a page as the payload mustn't fragment. */
1334 if (unlikely((txreq
.offset
+ txreq
.size
) > PAGE_SIZE
)) {
1335 netdev_err(queue
->vif
->dev
,
1336 "txreq.offset: %x, size: %u, end: %lu\n",
1337 txreq
.offset
, txreq
.size
,
1338 (txreq
.offset
&~PAGE_MASK
) + txreq
.size
);
1339 xenvif_fatal_tx_err(queue
->vif
);
1343 index
= pending_index(queue
->pending_cons
);
1344 pending_idx
= queue
->pending_ring
[index
];
1346 data_len
= (txreq
.size
> PKT_PROT_LEN
&&
1347 ret
< XEN_NETBK_LEGACY_SLOTS_MAX
) ?
1348 PKT_PROT_LEN
: txreq
.size
;
1350 skb
= xenvif_alloc_skb(data_len
);
1351 if (unlikely(skb
== NULL
)) {
1352 netdev_dbg(queue
->vif
->dev
,
1353 "Can't allocate a skb in start_xmit.\n");
1354 xenvif_tx_err(queue
, &txreq
, idx
);
1358 if (extras
[XEN_NETIF_EXTRA_TYPE_GSO
- 1].type
) {
1359 struct xen_netif_extra_info
*gso
;
1360 gso
= &extras
[XEN_NETIF_EXTRA_TYPE_GSO
- 1];
1362 if (xenvif_set_skb_gso(queue
->vif
, skb
, gso
)) {
1363 /* Failure in xenvif_set_skb_gso is fatal. */
1369 XENVIF_TX_CB(skb
)->pending_idx
= pending_idx
;
1371 __skb_put(skb
, data_len
);
1372 queue
->tx_copy_ops
[*copy_ops
].source
.u
.ref
= txreq
.gref
;
1373 queue
->tx_copy_ops
[*copy_ops
].source
.domid
= queue
->vif
->domid
;
1374 queue
->tx_copy_ops
[*copy_ops
].source
.offset
= txreq
.offset
;
1376 queue
->tx_copy_ops
[*copy_ops
].dest
.u
.gmfn
=
1377 virt_to_mfn(skb
->data
);
1378 queue
->tx_copy_ops
[*copy_ops
].dest
.domid
= DOMID_SELF
;
1379 queue
->tx_copy_ops
[*copy_ops
].dest
.offset
=
1380 offset_in_page(skb
->data
);
1382 queue
->tx_copy_ops
[*copy_ops
].len
= data_len
;
1383 queue
->tx_copy_ops
[*copy_ops
].flags
= GNTCOPY_source_gref
;
1387 skb_shinfo(skb
)->nr_frags
= ret
;
1388 if (data_len
< txreq
.size
) {
1389 skb_shinfo(skb
)->nr_frags
++;
1390 frag_set_pending_idx(&skb_shinfo(skb
)->frags
[0],
1392 xenvif_tx_create_map_op(queue
, pending_idx
, &txreq
, gop
);
1395 frag_set_pending_idx(&skb_shinfo(skb
)->frags
[0],
1396 INVALID_PENDING_IDX
);
1397 memcpy(&queue
->pending_tx_info
[pending_idx
].req
, &txreq
,
1401 queue
->pending_cons
++;
1403 request_gop
= xenvif_get_requests(queue
, skb
, txfrags
, gop
);
1404 if (request_gop
== NULL
) {
1406 xenvif_tx_err(queue
, &txreq
, idx
);
1411 __skb_queue_tail(&queue
->tx_queue
, skb
);
1413 queue
->tx
.req_cons
= idx
;
1415 if (((gop
-queue
->tx_map_ops
) >= ARRAY_SIZE(queue
->tx_map_ops
)) ||
1416 (*copy_ops
>= ARRAY_SIZE(queue
->tx_copy_ops
)))
1420 (*map_ops
) = gop
- queue
->tx_map_ops
;
1424 /* Consolidate skb with a frag_list into a brand new one with local pages on
1425 * frags. Returns 0 or -ENOMEM if can't allocate new pages.
1427 static int xenvif_handle_frag_list(struct xenvif_queue
*queue
, struct sk_buff
*skb
)
1429 unsigned int offset
= skb_headlen(skb
);
1430 skb_frag_t frags
[MAX_SKB_FRAGS
];
1432 struct ubuf_info
*uarg
;
1433 struct sk_buff
*nskb
= skb_shinfo(skb
)->frag_list
;
1435 queue
->stats
.tx_zerocopy_sent
+= 2;
1436 queue
->stats
.tx_frag_overflow
++;
1438 xenvif_fill_frags(queue
, nskb
);
1439 /* Subtract frags size, we will correct it later */
1440 skb
->truesize
-= skb
->data_len
;
1441 skb
->len
+= nskb
->len
;
1442 skb
->data_len
+= nskb
->len
;
1444 /* create a brand new frags array and coalesce there */
1445 for (i
= 0; offset
< skb
->len
; i
++) {
1449 BUG_ON(i
>= MAX_SKB_FRAGS
);
1450 page
= alloc_page(GFP_ATOMIC
|__GFP_COLD
);
1453 skb
->truesize
+= skb
->data_len
;
1454 for (j
= 0; j
< i
; j
++)
1455 put_page(frags
[j
].page
.p
);
1459 if (offset
+ PAGE_SIZE
< skb
->len
)
1462 len
= skb
->len
- offset
;
1463 if (skb_copy_bits(skb
, offset
, page_address(page
), len
))
1467 frags
[i
].page
.p
= page
;
1468 frags
[i
].page_offset
= 0;
1469 skb_frag_size_set(&frags
[i
], len
);
1471 /* swap out with old one */
1472 memcpy(skb_shinfo(skb
)->frags
,
1474 i
* sizeof(skb_frag_t
));
1475 skb_shinfo(skb
)->nr_frags
= i
;
1476 skb
->truesize
+= i
* PAGE_SIZE
;
1478 /* remove traces of mapped pages and frag_list */
1479 skb_frag_list_init(skb
);
1480 uarg
= skb_shinfo(skb
)->destructor_arg
;
1481 uarg
->callback(uarg
, true);
1482 skb_shinfo(skb
)->destructor_arg
= NULL
;
1484 skb_shinfo(nskb
)->tx_flags
|= SKBTX_DEV_ZEROCOPY
;
1490 static int xenvif_tx_submit(struct xenvif_queue
*queue
)
1492 struct gnttab_map_grant_ref
*gop_map
= queue
->tx_map_ops
;
1493 struct gnttab_copy
*gop_copy
= queue
->tx_copy_ops
;
1494 struct sk_buff
*skb
;
1497 while ((skb
= __skb_dequeue(&queue
->tx_queue
)) != NULL
) {
1498 struct xen_netif_tx_request
*txp
;
1502 pending_idx
= XENVIF_TX_CB(skb
)->pending_idx
;
1503 txp
= &queue
->pending_tx_info
[pending_idx
].req
;
1505 /* Check the remap error code. */
1506 if (unlikely(xenvif_tx_check_gop(queue
, skb
, &gop_map
, &gop_copy
))) {
1507 skb_shinfo(skb
)->nr_frags
= 0;
1512 data_len
= skb
->len
;
1513 callback_param(queue
, pending_idx
).ctx
= NULL
;
1514 if (data_len
< txp
->size
) {
1515 /* Append the packet payload as a fragment. */
1516 txp
->offset
+= data_len
;
1517 txp
->size
-= data_len
;
1519 /* Schedule a response immediately. */
1520 xenvif_idx_release(queue
, pending_idx
,
1521 XEN_NETIF_RSP_OKAY
);
1524 if (txp
->flags
& XEN_NETTXF_csum_blank
)
1525 skb
->ip_summed
= CHECKSUM_PARTIAL
;
1526 else if (txp
->flags
& XEN_NETTXF_data_validated
)
1527 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
1529 xenvif_fill_frags(queue
, skb
);
1531 if (unlikely(skb_has_frag_list(skb
))) {
1532 if (xenvif_handle_frag_list(queue
, skb
)) {
1533 if (net_ratelimit())
1534 netdev_err(queue
->vif
->dev
,
1535 "Not enough memory to consolidate frag_list!\n");
1536 skb_shinfo(skb
)->tx_flags
|= SKBTX_DEV_ZEROCOPY
;
1542 if (skb_is_nonlinear(skb
) && skb_headlen(skb
) < PKT_PROT_LEN
) {
1543 int target
= min_t(int, skb
->len
, PKT_PROT_LEN
);
1544 __pskb_pull_tail(skb
, target
- skb_headlen(skb
));
1547 skb
->dev
= queue
->vif
->dev
;
1548 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
1549 skb_reset_network_header(skb
);
1551 if (checksum_setup(queue
, skb
)) {
1552 netdev_dbg(queue
->vif
->dev
,
1553 "Can't setup checksum in net_tx_action\n");
1554 /* We have to set this flag to trigger the callback */
1555 if (skb_shinfo(skb
)->destructor_arg
)
1556 skb_shinfo(skb
)->tx_flags
|= SKBTX_DEV_ZEROCOPY
;
1561 skb_probe_transport_header(skb
, 0);
1563 /* If the packet is GSO then we will have just set up the
1564 * transport header offset in checksum_setup so it's now
1565 * straightforward to calculate gso_segs.
1567 if (skb_is_gso(skb
)) {
1568 int mss
= skb_shinfo(skb
)->gso_size
;
1569 int hdrlen
= skb_transport_header(skb
) -
1570 skb_mac_header(skb
) +
1573 skb_shinfo(skb
)->gso_segs
=
1574 DIV_ROUND_UP(skb
->len
- hdrlen
, mss
);
1577 queue
->stats
.rx_bytes
+= skb
->len
;
1578 queue
->stats
.rx_packets
++;
1582 /* Set this flag right before netif_receive_skb, otherwise
1583 * someone might think this packet already left netback, and
1584 * do a skb_copy_ubufs while we are still in control of the
1585 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
1587 if (skb_shinfo(skb
)->destructor_arg
) {
1588 skb_shinfo(skb
)->tx_flags
|= SKBTX_DEV_ZEROCOPY
;
1589 queue
->stats
.tx_zerocopy_sent
++;
1592 netif_receive_skb(skb
);
1598 void xenvif_zerocopy_callback(struct ubuf_info
*ubuf
, bool zerocopy_success
)
1600 unsigned long flags
;
1601 pending_ring_idx_t index
;
1602 struct xenvif_queue
*queue
= ubuf_to_queue(ubuf
);
1604 /* This is the only place where we grab this lock, to protect callbacks
1607 spin_lock_irqsave(&queue
->callback_lock
, flags
);
1609 u16 pending_idx
= ubuf
->desc
;
1610 ubuf
= (struct ubuf_info
*) ubuf
->ctx
;
1611 BUG_ON(queue
->dealloc_prod
- queue
->dealloc_cons
>=
1613 index
= pending_index(queue
->dealloc_prod
);
1614 queue
->dealloc_ring
[index
] = pending_idx
;
1615 /* Sync with xenvif_tx_dealloc_action:
1616 * insert idx then incr producer.
1619 queue
->dealloc_prod
++;
1621 wake_up(&queue
->dealloc_wq
);
1622 spin_unlock_irqrestore(&queue
->callback_lock
, flags
);
1624 if (likely(zerocopy_success
))
1625 queue
->stats
.tx_zerocopy_success
++;
1627 queue
->stats
.tx_zerocopy_fail
++;
1630 static inline void xenvif_tx_dealloc_action(struct xenvif_queue
*queue
)
1632 struct gnttab_unmap_grant_ref
*gop
;
1633 pending_ring_idx_t dc
, dp
;
1634 u16 pending_idx
, pending_idx_release
[MAX_PENDING_REQS
];
1637 dc
= queue
->dealloc_cons
;
1638 gop
= queue
->tx_unmap_ops
;
1640 /* Free up any grants we have finished using */
1642 dp
= queue
->dealloc_prod
;
1644 /* Ensure we see all indices enqueued by all
1645 * xenvif_zerocopy_callback().
1650 BUG_ON(gop
- queue
->tx_unmap_ops
> MAX_PENDING_REQS
);
1652 queue
->dealloc_ring
[pending_index(dc
++)];
1654 pending_idx_release
[gop
-queue
->tx_unmap_ops
] =
1656 queue
->pages_to_unmap
[gop
-queue
->tx_unmap_ops
] =
1657 queue
->mmap_pages
[pending_idx
];
1658 gnttab_set_unmap_op(gop
,
1659 idx_to_kaddr(queue
, pending_idx
),
1661 queue
->grant_tx_handle
[pending_idx
]);
1662 xenvif_grant_handle_reset(queue
, pending_idx
);
1666 } while (dp
!= queue
->dealloc_prod
);
1668 queue
->dealloc_cons
= dc
;
1670 if (gop
- queue
->tx_unmap_ops
> 0) {
1672 ret
= gnttab_unmap_refs(queue
->tx_unmap_ops
,
1674 queue
->pages_to_unmap
,
1675 gop
- queue
->tx_unmap_ops
);
1677 netdev_err(queue
->vif
->dev
, "Unmap fail: nr_ops %tx ret %d\n",
1678 gop
- queue
->tx_unmap_ops
, ret
);
1679 for (i
= 0; i
< gop
- queue
->tx_unmap_ops
; ++i
) {
1680 if (gop
[i
].status
!= GNTST_okay
)
1681 netdev_err(queue
->vif
->dev
,
1682 " host_addr: %llx handle: %x status: %d\n",
1691 for (i
= 0; i
< gop
- queue
->tx_unmap_ops
; ++i
)
1692 xenvif_idx_release(queue
, pending_idx_release
[i
],
1693 XEN_NETIF_RSP_OKAY
);
1697 /* Called after netfront has transmitted */
1698 int xenvif_tx_action(struct xenvif_queue
*queue
, int budget
)
1700 unsigned nr_mops
, nr_cops
= 0;
1703 if (unlikely(!tx_work_todo(queue
)))
1706 xenvif_tx_build_gops(queue
, budget
, &nr_cops
, &nr_mops
);
1711 gnttab_batch_copy(queue
->tx_copy_ops
, nr_cops
);
1713 ret
= gnttab_map_refs(queue
->tx_map_ops
,
1715 queue
->pages_to_map
,
1720 work_done
= xenvif_tx_submit(queue
);
1725 static void xenvif_idx_release(struct xenvif_queue
*queue
, u16 pending_idx
,
1728 struct pending_tx_info
*pending_tx_info
;
1729 pending_ring_idx_t index
;
1730 unsigned long flags
;
1732 pending_tx_info
= &queue
->pending_tx_info
[pending_idx
];
1733 spin_lock_irqsave(&queue
->response_lock
, flags
);
1734 make_tx_response(queue
, &pending_tx_info
->req
, status
);
1735 index
= pending_index(queue
->pending_prod
);
1736 queue
->pending_ring
[index
] = pending_idx
;
1737 /* TX shouldn't use the index before we give it back here */
1739 queue
->pending_prod
++;
1740 spin_unlock_irqrestore(&queue
->response_lock
, flags
);
1744 static void make_tx_response(struct xenvif_queue
*queue
,
1745 struct xen_netif_tx_request
*txp
,
1748 RING_IDX i
= queue
->tx
.rsp_prod_pvt
;
1749 struct xen_netif_tx_response
*resp
;
1752 resp
= RING_GET_RESPONSE(&queue
->tx
, i
);
1756 if (txp
->flags
& XEN_NETTXF_extra_info
)
1757 RING_GET_RESPONSE(&queue
->tx
, ++i
)->status
= XEN_NETIF_RSP_NULL
;
1759 queue
->tx
.rsp_prod_pvt
= ++i
;
1760 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue
->tx
, notify
);
1762 notify_remote_via_irq(queue
->tx_irq
);
1765 static struct xen_netif_rx_response
*make_rx_response(struct xenvif_queue
*queue
,
1772 RING_IDX i
= queue
->rx
.rsp_prod_pvt
;
1773 struct xen_netif_rx_response
*resp
;
1775 resp
= RING_GET_RESPONSE(&queue
->rx
, i
);
1776 resp
->offset
= offset
;
1777 resp
->flags
= flags
;
1779 resp
->status
= (s16
)size
;
1781 resp
->status
= (s16
)st
;
1783 queue
->rx
.rsp_prod_pvt
= ++i
;
1788 void xenvif_idx_unmap(struct xenvif_queue
*queue
, u16 pending_idx
)
1791 struct gnttab_unmap_grant_ref tx_unmap_op
;
1793 gnttab_set_unmap_op(&tx_unmap_op
,
1794 idx_to_kaddr(queue
, pending_idx
),
1796 queue
->grant_tx_handle
[pending_idx
]);
1797 xenvif_grant_handle_reset(queue
, pending_idx
);
1799 ret
= gnttab_unmap_refs(&tx_unmap_op
, NULL
,
1800 &queue
->mmap_pages
[pending_idx
], 1);
1802 netdev_err(queue
->vif
->dev
,
1803 "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: %x status: %d\n",
1806 tx_unmap_op
.host_addr
,
1808 tx_unmap_op
.status
);
1812 xenvif_idx_release(queue
, pending_idx
, XEN_NETIF_RSP_OKAY
);
1815 static inline int rx_work_todo(struct xenvif_queue
*queue
)
1817 return (!skb_queue_empty(&queue
->rx_queue
) &&
1818 xenvif_rx_ring_slots_available(queue
, queue
->rx_last_skb_slots
)) ||
1819 queue
->rx_queue_purge
;
1822 static inline int tx_work_todo(struct xenvif_queue
*queue
)
1824 if (likely(RING_HAS_UNCONSUMED_REQUESTS(&queue
->tx
)))
1830 static inline bool tx_dealloc_work_todo(struct xenvif_queue
*queue
)
1832 return queue
->dealloc_cons
!= queue
->dealloc_prod
;
1835 void xenvif_unmap_frontend_rings(struct xenvif_queue
*queue
)
1837 if (queue
->tx
.sring
)
1838 xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue
->vif
),
1840 if (queue
->rx
.sring
)
1841 xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue
->vif
),
1845 int xenvif_map_frontend_rings(struct xenvif_queue
*queue
,
1846 grant_ref_t tx_ring_ref
,
1847 grant_ref_t rx_ring_ref
)
1850 struct xen_netif_tx_sring
*txs
;
1851 struct xen_netif_rx_sring
*rxs
;
1855 err
= xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue
->vif
),
1856 tx_ring_ref
, &addr
);
1860 txs
= (struct xen_netif_tx_sring
*)addr
;
1861 BACK_RING_INIT(&queue
->tx
, txs
, PAGE_SIZE
);
1863 err
= xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue
->vif
),
1864 rx_ring_ref
, &addr
);
1868 rxs
= (struct xen_netif_rx_sring
*)addr
;
1869 BACK_RING_INIT(&queue
->rx
, rxs
, PAGE_SIZE
);
1874 xenvif_unmap_frontend_rings(queue
);
1878 static void xenvif_start_queue(struct xenvif_queue
*queue
)
1880 if (xenvif_schedulable(queue
->vif
))
1881 xenvif_wake_queue(queue
);
1884 int xenvif_kthread_guest_rx(void *data
)
1886 struct xenvif_queue
*queue
= data
;
1887 struct sk_buff
*skb
;
1889 while (!kthread_should_stop()) {
1890 wait_event_interruptible(queue
->wq
,
1891 rx_work_todo(queue
) ||
1892 queue
->vif
->disabled
||
1893 kthread_should_stop());
1895 /* This frontend is found to be rogue, disable it in
1896 * kthread context. Currently this is only set when
1897 * netback finds out frontend sends malformed packet,
1898 * but we cannot disable the interface in softirq
1899 * context so we defer it here, if this thread is
1900 * associated with queue 0.
1902 if (unlikely(queue
->vif
->disabled
&& netif_carrier_ok(queue
->vif
->dev
) && queue
->id
== 0))
1903 xenvif_carrier_off(queue
->vif
);
1905 if (kthread_should_stop())
1908 if (queue
->rx_queue_purge
) {
1909 skb_queue_purge(&queue
->rx_queue
);
1910 queue
->rx_queue_purge
= false;
1913 if (!skb_queue_empty(&queue
->rx_queue
))
1914 xenvif_rx_action(queue
);
1916 if (skb_queue_empty(&queue
->rx_queue
) &&
1917 xenvif_queue_stopped(queue
)) {
1918 del_timer_sync(&queue
->wake_queue
);
1919 xenvif_start_queue(queue
);
1925 /* Bin any remaining skbs */
1926 while ((skb
= skb_dequeue(&queue
->rx_queue
)) != NULL
)
1932 int xenvif_dealloc_kthread(void *data
)
1934 struct xenvif_queue
*queue
= data
;
1936 while (!kthread_should_stop()) {
1937 wait_event_interruptible(queue
->dealloc_wq
,
1938 tx_dealloc_work_todo(queue
) ||
1939 kthread_should_stop());
1940 if (kthread_should_stop())
1943 xenvif_tx_dealloc_action(queue
);
1947 /* Unmap anything remaining*/
1948 if (tx_dealloc_work_todo(queue
))
1949 xenvif_tx_dealloc_action(queue
);
1954 static int __init
netback_init(void)
1961 /* Allow as many queues as there are CPUs, by default */
1962 xenvif_max_queues
= num_online_cpus();
1964 if (fatal_skb_slots
< XEN_NETBK_LEGACY_SLOTS_MAX
) {
1965 pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
1966 fatal_skb_slots
, XEN_NETBK_LEGACY_SLOTS_MAX
);
1967 fatal_skb_slots
= XEN_NETBK_LEGACY_SLOTS_MAX
;
1970 rc
= xenvif_xenbus_init();
1974 rx_drain_timeout_jiffies
= msecs_to_jiffies(rx_drain_timeout_msecs
);
1982 module_init(netback_init
);
1984 static void __exit
netback_fini(void)
1986 xenvif_xenbus_fini();
1988 module_exit(netback_fini
);
1990 MODULE_LICENSE("Dual BSD/GPL");
1991 MODULE_ALIAS("xen-backend:vif");