This patch is generated against 2.6 git tree. I didn't break up this
patch since it has one functionality. Please review it.
Thanks
Shirley
Signed-off-by: Shirley Ma <xma@us.ibm.com>
------
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9e002f..6fb788b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,33 +80,48 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
return (struct skb_vnet_hdr *)skb->cb;
}
-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
{
- page->private = (unsigned long)vi->pages;
+ struct page *npage = (struct page *)page->private;
+
+ if (!npage)
+ page->private = (unsigned long)vi->pages;
+ else {
+ /* give a page list */
+ while (npage) {
+ if (npage->private == (unsigned long)0) {
+ npage->private = (unsigned long)vi->pages;
+ break;
+ }
+ npage = (struct page *)npage->private;
+ }
+ }
vi->pages = page;
}
-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
-{
- unsigned int i;
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- give_a_page(vi, skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->nr_frags = 0;
- skb->data_len = 0;
-}
-
static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
{
struct page *p = vi->pages;
- if (p)
+ if (p) {
vi->pages = (struct page *)p->private;
- else
+ /* use private to chain big packets */
+ p->private = (unsigned long)0;
+ } else
p = alloc_page(gfp_mask);
return p;
}
+void virtio_free_pages(void *buf)
+{
+ struct page *page = (struct page *)buf;
+
+ while (page) {
+ __free_pages(page, 0);
+ page = (struct page *)page->private;
+ }
+}
+
static void skb_xmit_done(struct virtqueue *svq)
{
struct virtnet_info *vi = svq->vdev->priv;
@@ -118,12 +133,36 @@ static void skb_xmit_done(struct virtqueue *svq)
netif_wake_queue(vi->dev);
}
...Interesting use after free :) --
Good catch. This code was run when virtio_net removal. I run many times
of remove virtio_net, and didn't hit any panic :(. Fixed it as below:
void virtio_free_pages(void *buf)
{
struct page *page = (struct page *)buf;
struct page *npage;
while (page) {
npage = page->private;
__free_pages(page, 0);
page = npage;
}
}
Thanks
Shirley
--
Thanks for catching the stupid mistake. This is the updated patch for review. Signed-off-by: Shirley Ma (xma@us.ibm.com) ------ diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b9e002f..5699bd3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -80,33 +80,50 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb) return (struct skb_vnet_hdr *)skb->cb; } -static void give_a_page(struct virtnet_info *vi, struct page *page) +static void give_pages(struct virtnet_info *vi, struct page *page) { - page->private = (unsigned long)vi->pages; + struct page *npage = (struct page *)page->private; + + if (!npage) + page->private = (unsigned long)vi->pages; + else { + /* give a page list */ + while (npage) { + if (npage->private == (unsigned long)0) { + npage->private = (unsigned long)vi->pages; + break; + } + npage = (struct page *)npage->private; + } + } vi->pages = page; } -static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb) -{ - unsigned int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - give_a_page(vi, skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->nr_frags = 0; - skb->data_len = 0; -} - static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask) { struct page *p = vi->pages; - if (p) + if (p) { vi->pages = (struct page *)p->private; - else + /* use private to chain big packets */ + p->private = (unsigned long)0; + } else p = alloc_page(gfp_mask); return p; } +void virtio_free_pages(void *buf) +{ + struct page *page = (struct page *)buf; + struct page *npage; + + while (page) { + npage = (struct page *)page->private; + __free_pages(page, 0); + page = npage; + } +} + static void skb_xmit_done(struct virtqueue *svq) { struct virtnet_info *vi = svq->vdev->priv; @@ -118,12 +135,36 @@ static void skb_xmit_done(struct virtqueue *svq) netif_wake_queue(vi->dev); } -static void ...
Hi Shirley,
This patch seems like a good idea, but it needs some work. As this is
the first time I've received a patch from you, I will go through it in extra
detail. (As virtio maintainer, it's my responsibility to include this patch,
The loops should cover both cases of the if branch. Either way, we want
to find the last page in the list so you can set that ->private pointer to
vi->pages.
Also, the "== (unsigned long)0" is a little verbose.
How about:
struct page *end;
/* Find end of list, sew whole thing into vi->pages. */
for (end = page; end->private; end = (struct page *)end->private);
end->private = (unsigned long)vi->pages;
This is a terrible name; it's specific to virtio_net, plus it should be
This smells a lot like a for loop to me?
struct page *i, *next;
for (i = buf; next; i = next) {
next = (struct page *)i->private;
__free_pages(i, 0);
This cast is unnecessary, but a comment would be nice:
And a matching comment here:
Really? I can't see where the current driver does this 6 byte offset. There
should be a header, then the packet...
Ah, I see below that you set things up this way. The better way to do this
is to create a new structure to use in various places.
struct padded_vnet_hdr {
struct virtio_net_hdr hdr;
/* This padding makes our packet 16 byte aligned */
char padding[6];
};
However, I question whether making it 16 byte is the right thing: the
I think you can move the memcpy outside the if, like so:
I can't help but think this statement should be one loop somehow. Something
like:
offset += copy;
while (len) {
len = skb_add_frag(skb, page, offset, len);
page = (struct page *)page->private;
offset = 0;
}
if (page)
The result of trying to merge all the cases here is messy. I'd split it
inside the loop into: add_recvbuf_small(vi, gfp), add_recvbuf_big(vi, gfp)
and add_recvbuf_mergeable(vi, gfp).
It'll also be easier for me to review each case then, as well as ...Because in QEMU it requires 10 bytes header in a separately, so one page is used to share between virtio_net_hdr header which is 10 bytes head and rest of data. So I put 6 bytes offset here between two buffers. I didn't look at the reason why a seperate buf is used for virtio_net_hdr I was little bit worried about qemu messed up len (i saw code in mergeable buffer checking len > PAGE_SIZE), so I put page checking I will try to separate this patch. Thanks Shirley --
Hi Shirley,
Looks like buf is a void *, so no cast should be necessary. But I could
It's a qemu bug. It insists the header be an element in the scatterlist by
itself. Unfortunately we have to accommodate it.
However, there's no reason for the merged rxbuf and big packet layout to be
the same: for the big packet case you should layout as follows:
#define BIG_PACKET_PAD (NET_SKB_PAD - sizeof(struct virtio_net_hdr) + NET_IP_ALIGN)
struct big_packet_page {
struct virtio_net_hdr hdr;
char pad[BIG_PACKET_PAD];
/* Actual packet data starts here */
unsigned char data[PAGE_SIZE - BIG_PACKET_PAD - sizeof(struct virtio_net_hdr)];
};
Then use this type as the template for registering the sg list for the
Yes, that has to stay inside, but the memcpy can move out. It's just a bit
neater to have more common code.
Thanks,
Rusty.
--
The padding was used for qemu userspace buffer copy, skb data is copied from the page for size of GOOD_COPY_LEN, and skb data is reserved NET_IP_ALIGN. If we add paddings as above, userspace copy will starts NET_SKB_PAD + num_buf is assigned after memcpy so we couldn't move memcpy out. Anyway I separated mergeable buffers and big packets in the new patch to make it clear and it will be easy to modify when we drop big packets support in the future. Thanks Shirley --
We do? Let's just fix this?
All we have to do is replace memcpy with proper iovec walk, correct?
Something like the followng (untested) patch? It's probably not too
late to put this in the next qemu release...
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 2f147e5..06c5148 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -434,26 +434,59 @@ static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count)
return offset;
}
+static int iov_skip(struct iovec *iov, int iovcnt, int count)
+{
+ int offset, i;
+
+ offset = i = 0;
+ while (offset < count && i < iovcnt) {
+ int len = MIN(iov[i].iov_len, count - offset);
+ iov[i].iov_base += len;
+ iov[i].iov_len -= len;
+ offset += len;
+ i++;
+ }
+
+ return offset;
+}
+
+static int iov_copy(struct iovec *to, struct iovec *from, int iovcnt, int count)
+{
+ int offset, i;
+
+ offset = i = 0;
+ while (offset < count && i < iovcnt) {
+ int len = MIN(from[i].iov_len, count - offset);
+ to[i].iov_base = from[i].iov_base;
+ to[i].iov_len = from[i].iov_len;
+ offset += len;
+ i++;
+ }
+
+ return i;
+}
+
static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt,
const void *buf, size_t size, size_t hdr_len)
{
- struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+ struct virtio_net_hdr hdr = {};
int offset = 0;
- hdr->flags = 0;
- hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+ hdr.flags = 0;
+ hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
if (n->has_vnet_hdr) {
- memcpy(hdr, buf, sizeof(*hdr));
- offset = sizeof(*hdr);
- work_around_broken_dhclient(hdr, buf + offset, size - offset);
+ memcpy(&hdr, buf, sizeof hdr);
+ offset = sizeof hdr;
+ work_around_broken_dhclient(&hdr, buf + offset, size - offset);
}
+ ...So does lguest. It's been that way since the beginning. Fixing this would result in breaking older guests. We really need to introduce a feature bit if we want to change this. Regards, Anthony Liguori --
-- MST --
It does? All I see it doing is writev/readv, and this passes things to tap which handles I am not sure I agree: we can't add feature bits for all bugs, can we? -- MST --
Agreed, we can't "fix" it in the guests, but it's a wart. That's why I haven't bothered fixing it, but if someone else wants to I'll cheer all the way. lguest did it because I knew I could fix lguest any time; it was a bad mistake I don't think it's worth it. But the spec does say that the implementation should not rely on the framing (I think there's a note that current implementations are buggy tho, so you should frame it separately anyway). That way future devices can get it right, at least. Thanks, Rusty. --
You might want to implement a more generic helper which does:
/* Return pointer into iovec if we can, otherwise copy into buf */
void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
{
unsigned int i;
void *p;
if (likely(iov_cnt && iov[0].iov_len >= len)) {
/* Nice contiguous chunk. */
void *p = iov[0].iov_base;
iov[i].iov_base += len;
iov[i].iov_len -= len;
return p;
}
p = buf;
for (i = 0; i < iov_cnt; i++) {
size_t this_len = min(len, iov[i].iov_len);
memcpy(p, iov[i].iov_base, this_len);
len -= this_len;
iov[i].iov_base += len;
iov[i].iov_len -= len;
if (len == 0)
return buf;
}
/* BTW, we screwed your iovec. */
return NULL;
}
Then use it in all the virtio drivers...
Thanks!
Rusty.
--
Hmm, is it really worth it to save a header copy if it's linear? We are going to access it anyway, and it fits into one cacheline nicely. On the other hand we have more code making life harder for compiler and --
Not sure: I think there would be many places where it would be useful. We do a similar thing in the kernel to inspect non-linear packets, and it's served us well. Cheers, Rusty. --
--
Hello Rusty, I have split the patch into a set: cleanups, new infrastructure, and actual allocation change in add buffers: add_recvbuf_big, add_recvbuf_small, add_recvbuf_mergeage per your suggestion, and also in recv buffers: recv_big, recv_small, recv_mergeable. I hope you will agree with it. I am testing the patch-set now, will submit it soon after finishing all test cases. Thanks Shirley --
some style comments. addressing them will make it
should be !npage->private
and nesting is too deep here:
this is cleaner in a give_a_page subroutine
the comment is not really helpful:
you say you use private to chain but 0 does not
please think of a way to get rid of magic constants like 6 and 2
replace MAX_SKB_FRAGS + 2 with something symbolic? We have it in 2 palces now.
terrible goto based loop
move stuff into subfunction, it will be much
more manageable, and convert this to a simple
this is pretty hairy ... has to be this way?
What you are trying to do here
is fill buffer with pages, in a loop, with first one
using a partial page, and then add it.
Is that it?
So please code this in a straight forward manner.
it should be as simple as:
offset = XXX
for (i = 0; i < MAX_SKB_FRAGS + 2; ++i) {
sg_set_buf(sg + i, p + offset, PAGE_SIZE - offset);
offset = 0;
}
space around +
--
I think I should use MAX_SKB_FRAGS + 2 instead. Now I only use Ok. I will integrate these comments with Rusty's and resubmit the patch set. Thanks Shirley --
