Stephen Hemminger writes: > Eric wisely pointed out that for larger sizes of nodes, the > current allocation in fib_trie wastes lots of memory. For a sample > of routes extracted from the bugzilla bug the largest node grows > to 2M bytes on 64 bit system. This leads to 2044K of wasted memory. > > There are two possible solutions (see attached). One uses vmalloc() > rather than alloc_pages, but has to add complexity on freeing. > The other adds a layer of indirection to the tnode lookup. > > Both have been tested on net-2.6.26 with the huge route table. > I slightly prefer the vmalloc version, but both work fine. Do we get slower with vmalloc due to TLB-lookups etc? Guess this should be investigated. http://mail.nl.linux.org/kernelnewbies/2005-12/msg00212.html Memory savings (different) Cheers --ro Current BGP here. Aver depth: 2.58 Max depth: 6 Leaves: 238903 Internal nodes: 58049 1: 30668 2: 11458 3: 9214 4: 3904 5: 1738 6: 721 7: 271 8: 74 16: 1 Pointers: 464272 Null ptrs: 167321 Total size: 7841 kB > > IPV4: fib_trie use vmalloc for large tnodes > > Use vmalloc rather than alloc_pages to avoid wasting memory. > The problem is that tnode structure has a power of 2 sized array, > plus a header. So the current code wastes almost half the memory > allocated because it always needs the next bigger size to hold > that small header. > > This is similar to an earlier patch by Eric, but instead of a list > and lock, I used a workqueue to handle the fact that vfree can't > be done in interrupt context. > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> > > --- > net/ipv4/fib_trie.c | 25 +++++++++++++++---------- > 1 file changed, 15 insertions(+), 10 deletions(-) > > --- a/net/ipv4/fib_trie.c 2008-04-04 08:57:01.000000000 -0700 > +++ b/net/ipv4/fib_trie.c 2008-04-04 08:57:03.000000000 -0700 > @@ -122,7 +122,10 @@ struct tnode { > unsigned char bits; /* 2log(KEYLENGTH) bits needed */ > unsigned int full_children; /* KEYLENGTH bits needed */ > unsigned int empty_children; /* KEYLENGTH bits needed */ > - struct rcu_head rcu; > + union { > + struct rcu_head rcu; > + struct work_struct work; > + }; > struct node *child[0]; > }; > > @@ -350,16 +353,16 @@ static inline void free_leaf_info(struct > > static struct tnode *tnode_alloc(size_t size) > { > - struct page *pages; > - > if (size <= PAGE_SIZE) > return kzalloc(size, GFP_KERNEL); > + else > + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); > +} > > - pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); > - if (!pages) > - return NULL; > - > - return page_address(pages); > +static void __tnode_vfree(struct work_struct *arg) > +{ > + struct tnode *tn = container_of(arg, struct tnode, work); > + vfree(tn); > } > > static void __tnode_free_rcu(struct rcu_head *head) > @@ -370,8 +373,10 @@ static void __tnode_free_rcu(struct rcu_ > > if (size <= PAGE_SIZE) > kfree(tn); > - else > - free_pages((unsigned long)tn, get_order(size)); > + else { > + INIT_WORK(&tn->work, __tnode_vfree); > + schedule_work(&tn->work); > + } > } > > static inline void tnode_free(struct tnode *tn) > IPV4: fib_trie split child off from tnode > > For large tnode's the power of 2 allocation wastes almost half the space > since the tnode is allocated with the power of 2 sized child pointers. > To solve this add a layer of indirection. For small nodes (the common case) > can just use a single allocation, for larger use pages like before. > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> > > > --- > net/ipv4/fib_trie.c | 43 ++++++++++++++++++++++++++----------------- > 1 file changed, 26 insertions(+), 17 deletions(-) > > --- a/net/ipv4/fib_trie.c 2008-04-03 09:09:45.000000000 -0700 > +++ b/net/ipv4/fib_trie.c 2008-04-03 22:08:33.000000000 -0700 > @@ -120,10 +120,10 @@ struct tnode { > t_key key; > unsigned char pos; /* 2log(KEYLENGTH) bits needed */ > unsigned char bits; /* 2log(KEYLENGTH) bits needed */ > + struct node **child; > unsigned int full_children; /* KEYLENGTH bits needed */ > unsigned int empty_children; /* KEYLENGTH bits needed */ > struct rcu_head rcu; > - struct node *child[0]; > }; > > #ifdef CONFIG_IP_FIB_TRIE_STATS > @@ -348,30 +348,40 @@ static inline void free_leaf_info(struct > call_rcu(&leaf->rcu, __leaf_info_free_rcu); > } > > -static struct tnode *tnode_alloc(size_t size) > +static struct tnode *tnode_alloc(int bits) > { > - struct page *pages; > - > - if (size <= PAGE_SIZE) > - return kzalloc(size, GFP_KERNEL); > + size_t space = sizeof(struct node *) << bits; > + size_t size = space + sizeof(struct tnode); > + struct tnode *tn; > > - pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); > - if (!pages) > + tn = kzalloc(size > PAGE_SIZE ? sizeof(struct tnode) : size, GFP_KERNEL); > + if (!tn) > return NULL; > > - return page_address(pages); > + if (size <= PAGE_SIZE) > + tn->child = (struct node **) (tn + 1); > + else { > + struct page *pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, > + get_order(space)); > + if (!pages) { > + kfree(tn); > + return NULL; > + } > + tn->child = page_address(pages); > + } > + > + return tn; > } > > static void __tnode_free_rcu(struct rcu_head *head) > { > struct tnode *tn = container_of(head, struct tnode, rcu); > - size_t size = sizeof(struct tnode) + > - (sizeof(struct node *) << tn->bits); > + size_t space = sizeof(struct node *) << tn->bits; > + size_t size = space + sizeof(struct tnode); > > - if (size <= PAGE_SIZE) > - kfree(tn); > - else > - free_pages((unsigned long)tn, get_order(size)); > + if (size > PAGE_SIZE) > + free_pages((unsigned long)tn->child, get_order(space)); > + kfree(tn); > } > > static inline void tnode_free(struct tnode *tn) > @@ -404,8 +414,7 @@ static struct leaf_info *leaf_info_new(i > > static struct tnode *tnode_new(t_key key, int pos, int bits) > { > - size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); > - struct tnode *tn = tnode_alloc(sz); > + struct tnode *tn = tnode_alloc(bits); > > if (tn) { > tn->parent = T_TNODE; -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
| Karl Meyer | PROBLEM: 2.6.23-rc "NETDEV WATCHDOG: eth0: transmit timed out" |
| Greg Kroah-Hartman | [PATCH 040/196] kobject: add kobject_add_ng function |
| Steven Rostedt | [RFC PATCH v4] Unified trace buffer |
| Dave Airlie | [git pull] drm patches for 2.6.27 final |
| Krzysztof Halasa | Re: [PATCH v2] Re: WAN: new PPP code for generic HDLC |
| David Miller | Re: [PATCH] Expose netdevice dev_id through sysfs |
| Jay Cliburn | Re: atl1 64-bit => 32-bit DMA borkage (reproducible, bisected) |
| Evgeniy Polyakov | [resend take 2 0/4] Distributed storage. |
git: | |
| Andrew Morton | Untracked working tree files |
| Miklos Vajna | [rfc] git submodules howto |
| Ben Collins | Re: [kernel.org users] [RFD] On deprecating "git-foo" for builtins |
| Jon Smirl | ! [rejected] master -> master (non-fast forward) |
| rancor | How to copy/pipe console buffert to file? |
| Pieter Verberne | File collision while using pkg_add |
| Greg Thomas | Re: Is it possible to fix a stale NFS hadle without rebooting? |
| Didier Wiroth | win32-codecs, avi and amd64 question |
| Netfilter kernel module | 9 hours ago | Linux kernel |
| serial driver xmit problem | 12 hours ago | Linux kernel |
| Why Windows is better than Linux | 12 hours ago | Linux general |
| How can I see my kernel messages in vt12? | 19 hours ago | Linux kernel |
| Grub | 1 day ago | Linux general |
| vmalloc_fault handling in x86_64 | 1 day ago | Linux kernel |
| epoll_wait()ing on epoll FD | 1 day ago | Linux kernel |
| Framebuffer in x86_64 causes problems to multiseat | 1 day ago | Linux kernel |
| Difference between 2.4 and 2.6 regarding thread creation | 1 day ago | Linux general |
| Compiling gfs2 on kernel 2.6.27 | 2 days ago | Linux kernel |
