For virtualization, we've developed virtio_ring for efficient communication.
This would also work well for userspace-kernel communication, particularly
for things like the tun device. By using the same ABI, we can join guests
to the host kernel trivially.
These patches are fairly alpha; I've seen some network stalls I have to
track down and there are some fixmes.
Comments welcome!
Rusty.
diff -r 99132ad16999 Documentation/test_vring.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Documentation/test_vring.c Sat Apr 05 21:31:40 2008 +1100
@@ -0,0 +1,47 @@
+#include <unistd.h>
+#include <linux/virtio_ring.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <poll.h>
+
+#ifndef __NR_vringfd
+#define __NR_vringfd 327
+#endif
+
+int main()
+{
+ int fd, r;
+ struct vring vr;
+ uint16_t used = 0;
+ struct pollfd pfd;
+ void *buf = calloc(vring_size(256, getpagesize()), 0);
+
+ vring_init(&vr, 256, buf, getpagesize());
+
+ fd = syscall(__NR_vringfd, buf, 256, &used);
+ if (fd < 0)
+ err(1, "vringfd gave %i", fd);
+
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 0)
+ err(1, "poll gave %i", r);
+
+ vr.used->idx++;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 1)
+ err(1, "poll after buf used gave %i", r);
+
+ used++;
+ r = poll(&pfd, 1, 0);
+
+ if (r != 0)
+ err(1, "poll after used incremented gave %i", r);
+
+ close(fd);
+ return 0;
+}
diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S
--- a/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:20:32 2008 +1100
+++ b/arch/x86/kernel/syscall_table_32.S Sat Apr 05 21:31:40 2008 +1100
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
.long sys_fallocate
.long sys_timerfd_settime /* 325 */
.long sys_timerfd_gettime
+ .long sys_vringfd
diff -r 99132ad16999 fs/Kconfig
--- a/fs/Kconfig Sat Apr 05 21:20:32 2008 +1100
+++ b/fs/Kconfig Sat Apr 05 21:31:40 2008 +1100
@@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig"
source "fs/nls/Kconfig"
source ...It turns out the lguest (and possibly kvm) want the addresses in the
ring buffer to only cover a certain part of memory, and be offset.
It makes sense that this be an ioctl.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff -r 08fb00b8acab Documentation/ioctl-number.txt
--- a/Documentation/ioctl-number.txt Sat Apr 05 21:31:40 2008 +1100
+++ b/Documentation/ioctl-number.txt Sat Apr 05 22:00:10 2008 +1100
@@ -183,6 +183,7 @@ 0xAC 00-1F linux/raw.h
0xAC 00-1F linux/raw.h
0xAD 00 Netfilter device in development:
<mailto:rusty@rustcorp.com.au>
+0xAE 00-01 linux/vring.h
0xB0 all RATIO devices in development:
<mailto:vgo@ratio.de>
0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
diff -r 08fb00b8acab fs/vring.c
--- a/fs/vring.c Sat Apr 05 21:31:40 2008 +1100
+++ b/fs/vring.c Sat Apr 05 22:00:10 2008 +1100
@@ -38,6 +38,8 @@ struct vring_info
u16 mask;
u16 __user *last_used;
u16 last_avail;
+
+ unsigned long base, limit;
const struct vring_ops *ops;
void *ops_data;
@@ -120,10 +122,30 @@ static int vring_release(struct inode *i
return 0;
}
+static int vring_ioctl(struct inode *in, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct vring_info *vr = filp->private_data;
+
+ switch (cmd) {
+ case VRINGSETBASE:
+ vr->base = arg;
+ break;
+ case VRINGSETLIMIT:
+ vr->limit = arg;
+ break;
+ default:
+ return -ENOTTY;
+ }
+
+ return 0;
+}
+
static const struct file_operations vring_fops = {
.release = vring_release,
.write = vring_write,
.poll = vring_poll,
+ .ioctl = vring_ioctl,
};
asmlinkage long sys_vringfd(void __user *addr,
@@ -166,6 +188,8 @@ asmlinkage long sys_vringfd(void __user
vr->mask = num_descs - 1;
vr->ops = NULL;
vr->used = NULL;
+ vr->limit = -1UL;
+ vr->base = 0;
err = get_user(vr->last_avail, &vr->ring.avail->idx);
if (err)
@@ -208,12 +232,15 @@ int vring_get_buffer(struct vring_info *
out_len = &dummy;
*in_len = ...This patch modifies tun to allow a vringfd to specify the receive
buffer. Because we can't copy to userspace in bh context, we queue
like normal then use the "pull" hook to actually do the copy.
More thought needs to be put into the possible races with ring
registration and a simultaneous close, for example (see FIXME).
We use struct virtio_net_hdr prepended to packets in the ring to allow
userspace to receive GSO packets in future (at the moment, the tun
driver doesn't tell the stack it can handle them, so these cases are
never taken).
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff -r 285c3112b26c Documentation/test_vring.c
--- a/Documentation/test_vring.c Sat Apr 05 22:00:10 2008 +1100
+++ b/Documentation/test_vring.c Sat Apr 05 22:15:56 2008 +1100
@@ -1,21 +1,62 @@
#include <unistd.h>
#include <linux/virtio_ring.h>
+#include <linux/ioctl.h>
+#include <linux/if_tun.h>
#include <stdio.h>
#include <stdint.h>
+#include <string.h>
#include <err.h>
#include <poll.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/sockios.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/types.h>
#ifndef __NR_vringfd
#define __NR_vringfd 327
#endif
+/* This sets up the Host end of the network device with an IP address, brings
+ * it up so packets will flow, the copies the MAC address into the hwaddr
+ * pointer. */
+static void configure_device(int fd, const char *devname, uint32_t ipaddr,
+ unsigned char hwaddr[6])
+{
+ struct ifreq ifr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+ /* Don't read these incantations. Just cut & paste them like I did! */
+ memset(&ifr, 0, sizeof(ifr));
+ strcpy(ifr.ifr_name, devname);
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = htonl(ipaddr);
+ if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+ err(1, "Setting %s interface address", devname);
+ ifr.ifr_flags = IFF_UP;
+ if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+ err(1, "Bringing interface %s up", ...This patch modifies tun to allow a vringfd to specify the send
buffer. The user does a write to push out packets from the buffer.
Again, more thought needs to be put into the possible races with ring
registration.
Again we use the 'struct virtio_net_hdr' to allow userspace to send
GSO packets. In this case, it can hint how much to copy, and the
other pages will be made into skb fragments.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff -r 8270b5fdf03f drivers/net/tun.c
--- a/drivers/net/tun.c Sat Apr 05 22:49:10 2008 +1100
+++ b/drivers/net/tun.c Sat Apr 05 22:51:10 2008 +1100
@@ -101,7 +101,7 @@ struct tun_struct {
u32 chr_filter[2];
u32 net_filter[2];
- struct vring_info *inring;
+ struct vring_info *inring, *outring;
#ifdef TUN_DEBUG
int debug;
@@ -258,6 +258,162 @@ static void tun_net_init(struct net_devi
}
}
+/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
+ * Users will learn not to do that. */
+static int get_user_skb_frags(const struct iovec *iv, size_t len,
+ struct skb_frag_struct *f)
+{
+ unsigned int i, j, num_pg = 0;
+ int err;
+ struct page *pages[MAX_SKB_FRAGS];
+
+ down_read(&current->mm->mmap_sem);
+ while (len) {
+ int n, npages;
+ unsigned long base, len;
+ base = (unsigned long)iv->iov_base;
+ len = (unsigned long)iv->iov_len;
+
+ if (len == 0) {
+ iv++;
+ continue;
+ }
+
+ /* How many pages will this take? */
+ npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+ if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ n = get_user_pages(current, current->mm, base, npages,
+ 0, 0, pages, NULL);
+ if (unlikely(n < 0)) {
+ err = n;
+ goto fail;
+ }
+
+ /* Transfer pages to the frag array */
+ for (j = 0; j < n; j++) {
+ f[num_pg].page = pages[j];
+ if (j == 0) {
+ f[num_pg].page_offset = offset_in_page(base);
+ f[num_pg].size = min(len, PAGE_SIZE -
+ ...This is how lguest uses the vringfd tun support. It needs more cleanup,
but it seems to basically work.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff -r 6979348a6ece Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c Sat Apr 05 22:02:28 2008 +1100
+++ b/Documentation/lguest/lguest.c Sat Apr 05 22:12:25 2008 +1100
@@ -43,6 +43,7 @@
#include "linux/virtio_console.h"
#include "linux/virtio_rng.h"
#include "linux/virtio_ring.h"
+#include "linux/vring.h"
#include "asm-x86/bootparam.h"
/*L:110 We can ignore the 39 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
@@ -56,6 +57,10 @@ typedef uint16_t u16;
typedef uint16_t u16;
typedef uint8_t u8;
/*:*/
+
+#ifndef __NR_vringfd
+#define __NR_vringfd 327
+#endif
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
@@ -101,6 +106,9 @@ struct device_list
/* The descriptor page for the devices. */
u8 *descpage;
+
+ /* Pointer to last used in descpage */
+ u8 *nextdesc;
/* A single linked list of devices. */
struct device *dev;
@@ -853,6 +861,13 @@ static void handle_console_output(int fd
* and write them (ignoring the first element) to this device's file descriptor
* (/dev/net/tun).
*/
+struct virtio_net_info
+{
+ struct virtqueue *xmit_vq, *recv_vq;
+ u16 xmit_used, recv_used;
+ int xmitfd;
+};
+
static void handle_net_output(int fd, struct virtqueue *vq)
{
unsigned int head, out, in;
@@ -870,6 +885,15 @@ static void handle_net_output(int fd, st
len = writev(vq->dev->fd, iov+1, out-1);
add_used_and_trigger(fd, vq, head, len);
}
+}
+
+static void handle_netring_output(int fd, struct virtqueue *vq)
+{
+ struct virtio_net_info *ni = vq->dev->priv;
+
+ /* We have output, kick the kernel. */
+ if (write(ni->xmitfd, "", 0) != 0)
+ err(1, "Writing to xmitfd");
}
/* This is where we handle a packet coming in from the tun device to our
@@ -1054,18 +1078,13 ...On second thought, this is not going to work. The network stack can clone individual pages out of this skb and put it into a new skb. Therefore whatever scheme we come up with will either need to be page-based, or add a flag to tell the network stack that it can't clone those pages. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt --
Erk... I'll put in the latter for now. A page-level solution is not really an option: if userspace hands us mmaped pages for example. Thanks, Rusty. --
From: Rusty Russell <rusty@rustcorp.com.au> Keep in mind that the core of the TCP stack really depends upon being able to slice and dice paged SKBs as is pleases in order to send packets out. In fact, it also does such splitting during SACK processing. It really is a base requirement for efficient TSO support. Otherwise the above operations would be so incredibly expensive we might as well rip all of the TSO support out. --
In this case that's OK; these are only for packets coming from userspace/guest, so the only interaction with the tcp code is possibly as a recipient. If tcp wanted to do zero-copy aio xmit, I think we'd need something else. Cheers, Rusty. --
In general the code looks good. The only thing I could not convince myself in is whether having generic ring buffer makes sense or not. At least the TUN driver would be more efficient if it had its own simple ring implementation. Less indirection, fewer callbacks, fewer if()s, etc. TUN already has the file descriptor and having two additional fds for rx and tx ring is a waste (think of a VPN server that has to have a bunch of TUN fds). Also as I mentioned before Jamal and I wanted to expose some of the SKB fields through TUN device. With the rx/tx rings the natural way of doing that would be the ring descriptor itself. It can of course be done the same way we copy proto info (PI) and GSO stuff before the packet but that means more copy_to_user() calls and yet more checks. So. What am I missing ? Why do we need generic ring for the TUN ? I looked at the lguest code a bit and it seems that we need a bunch of network specific code anyway. The cool thing is that you can now mmap the rings into the guest directly but the same thing can be done with TUN specific rings. Max --
The idea was to use the same virtio ring that the guests use. The thing with TUN specific ring is that the guests are the one allocating the rings within their memory space and the opposite makes life very complex. Cheers, --
We can do the same thing with TUN rings. I mean have them allocated in the guest space. With that we'd still have all of the advantages that I listed above. ie We'd have ring descriptors that carry packet info, less indirection, etc. Max --
I started modifying tun to do this directly, but it ended up with a whole heap
of code just for the rings, and a lot of current code (eg. read, write, poll)
ended up inside an 'if (tun->rings) ... else {'. Having a natural poll()
interface for the rings made more sense, so being their own fds fell out
naturally.
I decided to float this version because it does minimal damage to tun, and I
know that other people have wanted rings before: I'd like to know if this is
likely to be generic enough for them.
Thanks!
Rusty
--
Hmm, the version that I sent you awhile ago (remember I sent you an attachment with prototype of the new tun driver and user space code) was not that bad in that area. It mean it did not touch existing read()/write() path. The difference was that it allocated the rings and the data buffer in the kernel and mapped into the user-space. Which is not what you guys need but that's a separate thing. The fd thing could be an issue. As I mentioned the example would be a VPN server (OpenVPN, etc) with a bunch of client connection (typically tun per I see. I'll try to spend some time on this in a near future and take a crack at the version with the TUN specific rings. Although I said that many times now and it may not happen in the near enough future :). In the mean time if your current version helps you guys a lot I do not mind us putting it in. We can always add another mode or something that uses internal rings and gradually obsolete old read()/write() and generic rings. Max --
Another virtualization subsystem already uses this range. -- Any sufficiently difficult bug is indistinguishable from a feature. --
Nasty. Not listed in the ioctl list, and didn't find it with grep 8(. I don't think I ever used 0xAD which was assigned to me, so I can change to that. Thanks, Rusty. --
This should also set the .compat_ioctl pointer. If the argument is defined as an actual memory location instead of a pointer, you also don't need the compat_ptr() conversion for s390x, so both can point to the same vring_ioctl function. Arnd <>< --
I'm *sure* you meant to document that somewhat non-trivial proposed new kernel API as soon as you got a moment. But, since I went to the trouble of reverse engineering it, I decided not to wait and to make a little unreliable guide of my own. For those who are curious about how vringfd() is meant to work (or about how badly I misunderstood it), the info is at: http://lwn.net/SubscriberLink/276856/8c927025c53ad7ce/ Hopefully it will be helpful, jon --
Actually, yes. But I wanted to get it out there before I start the treck across to the virtualization summit. A few points: 'The page alignment for the used array is important - that array might be mapped separately into kernel space.' Well, the used array is written by one side only, so it's possible to split the ring here and make each part r/o to the other side. More importantly, a page boundary is almost certainly a cacheline boundary, and we already have a userspace interface for it. 'Note that the flags fields in the vring_avail and vring_used structures appear to be unused.' virtio uses these for wakeup/interrupt suppression. It's a cheap way to avoid hypercalls, and we can use them the same way to avoid system calls (you set the suppression bit while you're actually looking at the ring). The need for the kmap (and hence the atomic horror) has now been alleviated: I changed the shinfo destructor code to allow the destructor to hold onto the skb data so it can queue it and free it later. BTW, the only place currently where both output and input buffers are used is the virtio_blk driver doing a read, where the header describes the operation, and the other buffers are overwritten with the data. Thanks! Rusty. --
This looks like a candidate for anon_inode_getfd(), which would let you get rid of the code for registering your own file system. Arnd <><
65536 doesn't fit in u16.
J
--
Hi Rusty, A couple comments below. I suppose you are doing data copy in ->poll instead of ->read to save a system call? This is weird, not conformant to what the interface is supposed to do. This way select/poll syscalls might block in userspace datacopy. The application might have a higher priority fd in the fdset to be informed of, for example. So why not change this to the common arrangement, with vring_poll adding Can't the same be achieved by the app mlocking the vring pages, which then goes through standard rlimit checking ? --
Oh, this is a driver API to allow the copy to take place in atomic contexes. You might want to add some sort of limit enforcement. Also forgot mmap_sem there. --
Hi Marcelo, Thanks for reading this. Yeah, I dislike it too. The main motivation was that not all rings need to be "pulled" in this way, but it turns out that userspace can detect this (poll, check ring, if nothing, try read, check ring). So next version will Tun/tap writes to this from any context. Fortunately, this too is changing, as I came up with a sane way of queueing to userspace context for this case. Expect updated patches Monday. Thanks! Rusty. --
