We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 9 +++++++--
1 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e..4a78d2b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;
@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
+ if (status == -EAGAIN)
+ desc->last_cookie = array->last_cookie;
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
unsigned int array_size = ARRAY_SIZE(pages);
entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
int res;
- if (desc->page_index == 0)
+ if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
while (1) {
res = find_cache_page(desc);
if (res != -EAGAIN)
--
1.7.3.2
--
Otherwise, the VM may end up removing it while we're reading from it.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 16 +++++++++++++++-
1 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a78d2b..6edef12 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -639,6 +639,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
+ unlock_page(desc->page);
page_cache_release(desc->page);
desc->page = NULL;
}
@@ -646,8 +647,20 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ struct page *page;
+
+ for (;;) {
+ page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+ if (IS_ERR(page))
+ break;
+ lock_page(page);
+ if (page->mapping != NULL && PageUptodate(page))
+ break;
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return page;
}
/*
@@ -770,6 +783,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
desc->page_index = 0;
desc->page = page;
+ lock_page(page);
status = nfs_readdir_xdr_to_array(desc, page, inode);
if (status < 0)
--
1.7.3.2
--
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/inode.c | 1 +
include/linux/nfs_fs.h | 1 +
2 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f571..0018e07 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_addr_space_ops;
if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
/* Deal with crossing mountpoints */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c66fdb7..b5d3ab0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -401,6 +401,7 @@ extern const struct inode_operations nfs3_file_inode_operations;
#endif /* CONFIG_NFS_V3 */
extern const struct file_operations nfs_file_operations;
extern const struct address_space_operations nfs_file_aops;
+extern const struct address_space_operations nfs_dir_addr_space_ops;
static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
{
--
1.7.3.2
--
On Tue, Nov 30, 2010 at 7:47 PM, Trond Myklebust
I don't think this is valid.
Maybe it fixes a bug, but the commit description is misleading at
best. Since you have a reference count to the page, the page is not
going away. Locking may hide some other bug (due to serializing with
other code you care about), but it is _not_ about the "VM may end up
removing it".
Even from a serialization angle, I think this patch is a bit suspect,
since readdir() will always be called under the inode semaphore, so I
think you'll always be serialized wrt other readdir users. Of course,
you may have invalidation events etc that are outside of readdir, so
...
Anyway if this patch matters, there's something else going on, and you
need to describe that.
Linus
--
I'm not worried about other readdir calls invalidating the page. My concern is rather about the VM memory reclaimers ejecting the page from the page cache, and calling nfs_readdir_clear_array while we're referencing the page. This wasn't a problem with the previous readdir code, but it will be with the new incarnation because the actual filenames are stored outside the page itself. As far as I can see, the only way to protect against that is to lock the page, perform the usual tests and then release the page lock when we're No problem. I just wanted to get the patches out so that the people who are reporting regressions can start testing. -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
On Tue, Nov 30, 2010 at 8:29 PM, Trond Myklebust I think you're making a fundamental mistake here, and you're confused by a much deeper problem. The thing is, the ".releasepage" callback gets called _before_ the page actually gets removed from the page cache, and there is no guarantee that it will always be removed at all! In fact, anybody holding a reference to it will result in page_freeze_refs() not successfully clearing all the refs, and that in turn will abort the actual freeing of the page. So while you hold the page count, your page will NOT be freed. Guaranteed. But it is true that the ".releasepage()" function may be called. So if your NFS release callback ends up invalidating the data on that page, that page lock thing will make a difference, yes. But at the same time, are you sure that you are able to then handle the case of that page still existing in the page cache and being used afterwards? Looking at the code, it doesn't look that way to me. So I think you're confused, and the NFS code totally incorrectly thinks that ".releasepage" is something that happens at the last use of the page. It simply is not so. In fact, you seem to return 0, which I think means "failure to release", so the VM will just mark it busy again afterwards. Now, I think you do have a few options: - keep the current model. BUT! In the page cache release function (nfs_readdir_clear_array), make sure that you also clear the up-to-date bit, so that the page gets read back in since it no longer contains any valid information. And return success for the "releasepage" operatioin. Alternatively: - introduce a callback for the case of the page actually being gone from the page cache, which gets called _after_ the removal. which seems to be what you really want, since for you the releasepage thing is about releasing the data structures associated with that cache. So you don't want to worry about the page lock, and you don't want to worry about the case of "maybe it won't get ...
This is the option I'm attempting for now. I'm quite fine with the page only being readable while under the page lock since this is readdir, and so we don't have to worry about mmap() and its ilk. My latest incarnation of the patches has added in all the PagePrivate() foo to make try_to_release_page() work, and also adds in an invalidatepage() address space operation (I rediscovered that truncate_inode_pages() will otherwise call block_invalidatepage, and subsequently Oops). I'm still in the process of testing, but the latest patchset appears to be doing all the right things for now. I'll post an update later today so that Nick and others can test too. Cheers Trond -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
Wait, if nfs_readdir_clear_array can clear the page while somebody else has a refcount to it, what good is the refcount? Why is your .releasepage modifying the data on the page, instead of just the metadata in the struct page? -- All rights reversed --
Hi Rik This is readdir, so there should normally be only one process accessing the page. If that process locks it (we don't have mmap() to worry about, so page locking is reasonable here) then the scheme is safe. Note that we do clear Pg_uptodate under the page lock in the latest version of the releasepage method (patches to be published later today We want to be able to free up the data that is referenced by the array on the page before the page itself is freed. Cheers Trond -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
Indeed, my test script passes with this patch on top of Linus' master, thanks. I'll wait for the updated series before trying the other ones. Tested-by: Nick Bowler <nbowler@elliptictech.com> -- Nick Bowler, Elliptic Technologies (http://www.elliptictech.com/) --
v2 fixes the following issues: - Fix up the cookie usage in uncached_readdir() - Changelog fixups for the second patch - .releasepage should use kmap_atomic() so that it can be called from all direct reclaim contexts. - Ensure that .releasepage also clears Pg_uptodate - Set/clear the Pg_private flag to ensure .releasepage gets called when appropriate. - Add a .invalidatepage to ensure truncate_inode_pages() works correctly - Ensure that the anonymous page that is generated by uncached_readdir() doesn't leak memory. Trond Myklebust (3): NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler NFS: lock the readdir page while it is in use NFS: Fix a memory leak in nfs_readdir fs/nfs/dir.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- fs/nfs/inode.c | 1 + include/linux/nfs_fs.h | 1 + 3 files changed, 42 insertions(+), 8 deletions(-) -- 1.7.3.2 --
Before we can fix the memory leak due to not freeing up the contents
of the nfs_cache_array when clearing the page cache, we need to
ensure that shrink_page_list can't just lock the page and
call try_to_release_page() while we're accessing the array.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 16 +++++++++++++++-
1 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e03537f..3ec3f1c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -639,6 +639,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
+ unlock_page(desc->page);
page_cache_release(desc->page);
desc->page = NULL;
}
@@ -646,8 +647,20 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ struct page *page;
+
+ for (;;) {
+ page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+ if (IS_ERR(page))
+ break;
+ lock_page(page);
+ if (page->mapping != NULL && PageUptodate(page))
+ break;
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return page;
}
/*
@@ -771,6 +784,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
desc->page_index = 0;
desc->last_cookie = *desc->dir_cookie;
desc->page = page;
+ lock_page(page);
status = nfs_readdir_xdr_to_array(desc, page, inode);
if (status < 0)
--
1.7.3.2
--
We need to ensure that the entries in the nfs_cache_array get cleared
when the page is removed from the page cache. To do so, we use the
releasepage address_space operation (which also requires us to set
the Pg_private flag).
Change nfs_readdir_clear_array to use kmap_atomic(), so that the
function can be safely called from all direct reclaim contexts.
Finally, modify the cache_page_release helper to call
nfs_readdir_clear_array directly, when dealing with an anonymous
page from 'uncached_readdir'.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 22 +++++++++++++++++-----
fs/nfs/inode.c | 1 +
include/linux/nfs_fs.h | 1 +
3 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3ec3f1c..4c6319e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -58,6 +58,7 @@ static int nfs_rename(struct inode *, struct dentry *,
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_invalidatepage(struct page*, unsigned long);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -85,6 +86,7 @@ const struct inode_operations nfs_dir_inode_operations = {
const struct address_space_operations nfs_dir_addr_space_ops = {
.releasepage = nfs_readdir_clear_array,
+ .invalidatepage = nfs_readdir_invalidatepage,
};
#ifdef CONFIG_NFS_V3
@@ -216,15 +218,22 @@ void nfs_readdir_release_array(struct page *page)
static
int nfs_readdir_clear_array(struct page *page, gfp_t mask)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array;
int i;
- if (IS_ERR(array))
- return PTR_ERR(array);
+ array = kmap_atomic(page, KM_USER0);
for (i = 0; i < array->size; i++)
kfree(array->array[i].string.name);
- nfs_readdir_release_array(page);
- return 0;
+ kunmap_atomic(array, ...On Wed, Dec 1, 2010 at 7:36 AM, Trond Myklebust
So I really think that the whole "releasepage" use in NFS is simply
overly complicated and was obviously too subtle.
The whole need for odd return values, for the page lock, and for the
addition of clearing the up-to-date bit comes from the fact that this
wasn't really what releasepage was designed for.
'releasepage' was really designed for the filesystem having its own
version of 'try_to_free_buffers()', which is just an optimistic "ok,
we may be releasing this page, so try to get rid of any IO structures
you have cached". It wasn't really a memory management thing.
And the thing is, it looks trivial to do the memory management
approach by adding a new callback that gets called after the page is
actually removed from the page cache. If we do that, then there are no
races with any other users, since we remove things from the page cache
atomically wrt page cache lookup. So the need for playing games with
page locking and 'uptodate' simply goes away. As does the PG_private
thing or the interaction with invalidatepage() etc.
So this is a TOTALLY UNTESTED trivial patch that just adds another
callback. Does this work? I dunno. But I get the feeling that instead
of having NFS work around the odd semantics that don't actually match
what NFS wants, introducing a new callback with much simpler semantics
would be simpler for everybody, and avoid the need for subtle code.
Hmm?
Linus
Surely somebody can have just looked up the page and gotten a reference count, right before your ->freepage call is invoked? CPU A CPU B look up page grab refcount ->freepage use contents of page Am I overlooking something obvious? -- All rights reversed --
The page is not cached any more at this point therefore looking it up won't find it. Benny --
No.
The removal from the page cache is atomic, even in the presence of the
lockless lookup.
The page cache lookup does a "get_page_unless_zero()" on the count, so
when __remove_mapping() has removed the page using
"page_freeze_refs()", it's really gone, and cannot be looked up.
And if that is broken, then we have much more serious problems (like
aliasing the same page when doing mmap/read etc), so that's more than
just an implementation detail, it's a fundamental requirement of the
whole page-cache design.
And that's the whole point of adding this callback to the
__remove_mapping() stage: that's the _only_ point where we really end
up knowing that "yes, we really removed that page, and there are no
more users".
Linus
--
Doh, you're right. I forgot to look at all the stuff that __remove_mapping does nowadays and remembered some very old code from vmscan.c instead. Acked-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
This works for me, and I can code up a patch that uses it if you like... -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
This violates the "Really. Don't send patches as attachments." lkml-faq set forth by yourself ;) Am I the only one who still uses a mail reader that doesn't decode mime by default? Maybe it's time to move on... Thanks, Miklos --
Hmm... Looking again at the problem, it appears that the same callback needs to be added to truncate_complete_page() and invalidate_complete_page2(). Otherwise we end up in a situation where the page can sometimes be removed from the page cache without calling Cheers Trond -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
Yes, I was wondering quite how we would define this ->freepage thing, if it gets called from one place that removes from page cache and not from others. Another minor problem with it: it would probably need to take the struct address_space *mapping as arg as well as struct page *page: because by this time page->mapping has been reset to NULL. But I'm not at all keen on adding a calllback in this very special frozen-to-0-references place: please let's not do it without an okay from Nick Piggin (now Cc'ed). I agree completely with what Linus said originally about how the page cannot be freed while there's a reference to it, and it should be possible to work this without your additional page locks. Your ->releasepage should be able to judge whether the page is likely (not certain) to be freed - page_count 3? 1 for the page cache, 1 for the page_private reference, 1 for vmscan's reference, I think. Then it can mark !PageUptodate and proceed with freeing the stuff you had allocated, undo page_has_private and its reference, and return 1 (or return 0 if it decides to hold on to the page). If something races in and grabs another reference to prevent removal from page cache and freeing, then won't read_cache_page(), seeing !Uptodate, do the right thing and set up the required info again? Or perhaps I haven't looked far enough, and you do have races which actually need your page locks, I can see they make it easier to think about. But I'd prefer us not to throw in another callback if it's well workable with the ->releasepage we already have. (If it helps, perhaps we could adjust shrink_page_list() to allow for the page being removed from page cache inside try_to_release_page() - but I don't think that should be necessary.) Hugh --
The thing is, NFS already really uses releasepage for its "real"
designed usage, in the form of the fscache case (which the directory
inodes don't use).
I really hate mixing things up. The NFS directory case really hasn't
got anything to do with releasepage, and taking the page lock on the
read side is just really sad. As is marking the page not up-to-date,
just so that it will get filled again.
In fact, I don't even know if it's kosher by VFS standards to clear
the up-to-date bit in the first place after it has already gotten set.
It absolutely isn't for anything that can be mmap'ed. Obviously, mmap
doesn't work on the directory cache anyway, but the point is that the
work-arounds for the semantic gap are really quite ugly.
Certainly much uglier than just adding some much simpler semantics to
the "now I'm releasing the page" case in the VM.
Linus
--
That is very brittle. I'd prefer not to have to scan linux-mm every week in order to find out if someone changed the page_count. However, while reading Documentation/filesystems/vfs.txt (in order to add documentation for freepage) I was surprised to read that the ->releasepage() is itself supposed to be allowed to actually remove the page from the address space if it so desires. Looking at the actual code in shrink_page_list() and friends I can't see how that can possibly fail to break things, but if it were true, then that might enable us to call remove_mapping() in order to safely free the page before it gets cleared. Cheers Trond -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
On Wed, 01 Dec 2010 15:05:38 -0500 That doesn't sound right. It came from Neil in 2006. --
On Wed, 1 Dec 2010 12:39:29 -0800 Andrew Morton <akpm@linux-foundation.org> Nope, no idea, sorry. No releasepage functions do anything like that, and no call sites suggest it could be a possibility. Quite the reverse - they are likely to remove the page from the mapping without checking that it is still in the mapping. So that sentence should be deleted. (Getting good review for doco updates is sooo hard :-) NeilBrown --
On Thu, 2 Dec 2010 08:29:13 +1100
This?
From: Andrew Morton <akpm@linux-foundation.org>
->releasepage() does not remove the page from the mapping.
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Documentation/filesystems/vfs.txt | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff -puN Documentation/filesystems/vfs.txt~documentation-filesystems-vfstxt-fix-repeasepage-description Documentation/filesystems/vfs.txt
--- a/Documentation/filesystems/vfs.txt~documentation-filesystems-vfstxt-fix-repeasepage-description
+++ a/Documentation/filesystems/vfs.txt
@@ -660,11 +660,10 @@ struct address_space_operations {
releasepage: releasepage is called on PagePrivate pages to indicate
that the page should be freed if possible. ->releasepage
should remove any private data from the page and clear the
- PagePrivate flag. It may also remove the page from the
- address_space. If this fails for some reason, it may indicate
- failure with a 0 return value.
- This is used in two distinct though related cases. The first
- is when the VM finds a clean page with no active users and
+ PagePrivate flag. If releasepage() fails for some reason, it must
+ indicate failure with a 0 return value.
+ releasepage() is used in two distinct though related cases. The
+ first is when the VM finds a clean page with no active users and
wants to make it a free page. If ->releasepage succeeds, the
page will be removed from the address_space and become free.
_
--
On Wed, 1 Dec 2010 14:43:14 -0800 Andrew Morton <akpm@linux-foundation.org> Perfect, thanks. --
On Wed, Dec 1, 2010 at 10:54 AM, Trond Myklebust
Yes, I think any caller of __remove_from_page_cache() should do it
once it has dropped all locks.
And to be consistent with that rule, even in the __remove_mapping()
function I suspect the code to call ->freepage() might as well be done
only in the __remove_from_page_cache() case (ie not in the
PageSwapCache() case).
Then, add the case to the end of "remove_page_cache()" itself, and now
it's really easy to just grep for __remove_from_page_cache() and make
sure they all do it.
That sounds sane, no?
Linus
--
Something like the following then?
-----------------------------------------------------------------------------------------
From 3a46d5eab1ac6efe9dfaf873e23de7589e0acccc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 1 Dec 2010 13:35:19 -0500
Subject: [PATCH] Call the filesystem back whenever a page is removed from the page cache
NFS needs to be able to release objects that are stored in the page
cache once the page itself is no longer visible from the page cache.
This patch adds a callback to the address space operations that allows
filesystems to perform page cleanups once the page has been removed
from the page cache.
Original patch by: Linus Torvalds <torvalds@linux-foundation.org>
[trondmy: cover the cases of invalidate_inode_pages2() and
truncate_inode_pages()]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
Documentation/filesystems/Locking | 5 +++++
Documentation/filesystems/vfs.txt | 5 +++++
include/linux/fs.h | 1 +
mm/truncate.c | 8 ++++++++
mm/vmscan.c | 3 +++
5 files changed, 22 insertions(+), 0 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index a91f308..06d6b71 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,6 +173,7 @@ prototypes:
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
@@ -193,6 +194,7 @@ perform_write: no n/a yes
bmap: no
invalidatepage: no yes
releasepage: no yes
+freepage: no yes
direct_IO: no
launder_page: no yes
@@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It ...On Wed, Dec 1, 2010 at 12:10 PM, Trond Myklebust
I'd do this in "remove_from_page_cache()" instead of in
"truncate_complete_page()". Otherwise it will miss any other callers
that use "remove_from_page_cache()" (not that there are many, and I
doubt NFS cares about the ones that do exists, but..)
Linus
--
Adding Nick Piggin back to the Cc. I do think it needs to be freepage(struct address_space *, struct page *), even if you happen not to need to know the mapping in the NFS case. --
On Wed, 01 Dec 2010 15:10:50 -0500 It would be good to think about and then clearly spell out exactly what state the page is in here. It is locked, and I assume clean and not under writeback. What about its refcount, freezedness status and eligibility for lookups? And as Hugh pointed out, some callees might needs the address_space* although we can perhaps defer that until such a callee turns up. If/when that happens we might have a problem though: if this locked page is no longer attached to the address_space then what now pins the address_space, protecting it from inode reclaim? --
That's an excellent point and trumps mine: it would be actively wrong to provide the struct address_space *mapping arg I was asking for. (Bet someone then tries stashing it away via page->private though.) Hugh --
Hmm, thinking further along the same lines: can we even guarantee that the filesystem module is still loaded at that point? i.e. might mapping->freepage now be pointing off into the garbage heap? Hugh --
On Wed, 1 Dec 2010 13:15:07 -0800 (PST) I don't see anything on the VFS side which would prevent a module unload. Or, more realistically, a concurrent unmount, freeing of the superblock and everything associated with it. All we have here is a page*. Probably on most call paths we'll be OK - if a process is in the middle of a file truncate, holdin a file* ref which holds an inode ref then nobody will be unmounting that fs and hence nobody will be unloading that module. However on the random_code->alloc_page->vmscan->releasepage path, none of that applies. --
Just out of interest, what ensures that the mapping is still around for the 'spin_unlock_irq(&mapping->tree_lock);' in __remove_mapping()? I'm clearly missing whatever mechanism prevents iput_final() from racing with vmscan if the latter clears out the last page from the mapping. -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
On Wed, 01 Dec 2010 16:51:12 -0500 Nothing, afacit. I think this was the race which I taunted the mm developers about a couple of months back (can't find the email) and nobody contradicted The mechanism is called "luck". Way back in the 2.5.late days there was such a bug in the kernel (inode was reclaimed while vmscan was playing with the address_space) and I was able to trigger oopses from it. It required really massive, withering amounts of memory pressure. Stupid amounts. I should dig out those tools and remember how to operate them... --
No, we're good.
Module unload has to go through a "stop_machine()" cycle, and that in
turn requires an idle period for everything. And just a preemption
reschedule isn't enough for that.
So what is sufficient is that
- we had the page locked and on the mapping
This implies that we had an inode reference to the module, and the
page lock means that the inode reference cannot go away (because it
will involve invalidate-pages etc)
- we're not sleeping after __remove_mapping, so unload can't happen afterwards.
A _lot_ of the module races depend on that latter thing. We have
almost no cases that are strictly about actual reference counts etc.
Linus
--
On Wed, 1 Dec 2010 14:24:42 -0800 OK, the stop_machine() plugs a lot of potential race-vs-module-unload things. But Trond is referring to races against vmscan inode reclaim, unmount, etc. --
Yes, that was my question. However, Linus' explanation appears to answer one of Hugh's objections: we can apparently use a preempt-disable() in order to ensure that the module containing the mapping->a_ops is not unloaded until after the ->freepage() call is complete. That would imply that ->freepage() cannot sleep, but I don't think that is too nasty a restriction. -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
Revised patch would be as follows...
----------------------------------------------------------------------------------------
From 1cd6bf106ef40dec415e375eb3632b9a84a72d5f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 1 Dec 2010 13:35:19 -0500
Subject: [PATCH] Call the filesystem back whenever a page is removed from the page cache
NFS needs to be able to release objects that are stored in the page
cache once the page itself is no longer visible from the page cache.
This patch adds a callback to the address space operations that allows
filesystems to perform page cleanups once the page has been removed
from the page cache.
Original patch by: Linus Torvalds <torvalds@linux-foundation.org>
[trondmy: cover the cases of invalidate_inode_pages2() and
truncate_inode_pages()]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
include/linux/fs.h | 1 +
mm/truncate.c | 8 ++++++++
mm/vmscan.c | 11 +++++++++++
5 files changed, 33 insertions(+), 1 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index a91f308..b6426f1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,12 +173,13 @@ prototypes:
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
locking rules:
- All except set_page_dirty may block
+ All except set_page_dirty and freepage may block
BKL PageLocked(page) i_mutex
writepage: no yes, unlocks (see below)
@@ -193,6 +194,7 @@ perform_write: no n/a yes
...On Wed, 01 Dec 2010 18:21:16 -0500 So here we're assuming that `mapping' was pinned by other means. And here. --
Yes. Both these functions are static, and their callers are assuming that something is already pinning the underlying inode, so the above should be quite safe. -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
So?
A filesystem module cannot be unloaded while it's still mounted.
And unmount doesn't succeed until all inodes are gone.
And getting rid of an inode doesn't succeed until all pages associated
with it are gone.
And getting rid of the pages involves locking them (whether in
truncate or vmscan) and removing them from all lists.
Ergo: vmscan has a locked page leads to the filesystem being
guaranteed to not be unmounted. And that, in turn, guarantees that
the module won't be unloaded until the machine has gone through an
idle cycle.
It really is that simple. There's nothing subtle there. The reason
spin_unlock(&mapping->tree_lock) is safe is exactly the above trivial
chain of dependencies. And it's also exactly why
mapping->a_ops->freepage() would also be safe.
This is pretty much how all the module races are handled. Doing module
ref-counts per page (or per packet in flight for things like
networking) would be prohibitively expensive. There's no way we can
ever do that.
Linus
--
On Wed, 1 Dec 2010 15:31:11 -0800
The page isn't attached to the address_space any more:
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
...
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
...
}
--
Did you even read the email?
Here, let me quote the important parts:
"module won't be unloaded until the machine has gone through an idle cycle"
"This is pretty much how all the module races are handled. Doing module
ref-counts per page (or per packet in flight for things like
networking) would be prohibitively expensive."
IOW, the whole "stop_machine()" part is fundamental. That whole
"module unload won't happen until we've gone through an idle cycle" is
EXACTLY why we don't need to have the page attached or ref-counted -
we're still safe.
Linus
--
On Wed, 1 Dec 2010 17:05:43 -0800 We stopped talking about module unload hours ago. That matter is settled. What we're talking about is races against memory reclaim, unmount, etc. --
Ahh. Those I can believe in. Although I think they'd almost
incidentally be fixed by making inode freeing (which is where the
'struct address_space' is embedded) RCU-safe, which we're going to do
anyway in 38. Then we could make the vmscan code just be a rcu-read
section.
Of course, I do think the race is basically impossible to hit in
practice regardless.
Linus
--
I didn't know that aspect of it. It will be nice to plug this race - it's been there for so long because nobody was able to think of an acceptable way of fixing it by direct means (synchronous locking, refcounting, etc). Taking a ref on the inode doesn't work, because we can't run iput_final() in direct-reclaim contexts (lock ordering snafus). vmscan is the problematic path - I _think_ all other code paths which remove pagecache have an inode ref. But this assumes that inode->i_mapping points at inode->i_data! Need to think about the situation where it points at a different inode's i_data - in that case Actually I was able to hit the race back in late 2.5 or thereabouts. Really massive memory pressure caused vmscan->icache_shrinker to free the inode/address_space while another CPU in vmscan was playing with the address_space. That was quite a debugging session ;) --
v2 fixes the following issues:
- Fix up the cookie usage in uncached_readdir()
- Changelog fixups for the second patch
- .releasepage should use kmap_atomic() so that it can be called
from all direct reclaim contexts.
- Ensure that .releasepage also clears Pg_uptodate
- Set/clear the Pg_private flag to ensure .releasepage gets called when
appropriate.
- Add a .invalidatepage to ensure truncate_inode_pages() works correctly
- Ensure that the anonymous page that is generated by uncached_readdir()
doesn't leak memory.
v3:
- add the freepage() address space operation.
- Dump the page locking
- rewrite patch 'Fix a memory leak in nfs_readdir' to work with the new
->freepage() callback.
Linus Torvalds (1):
Call the filesystem back whenever a page is removed from the page
cache
Trond Myklebust (2):
NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler
NFS: Fix a memory leak in nfs_readdir
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
fs/nfs/dir.c | 28 +++++++++++++++++-----------
fs/nfs/inode.c | 1 +
include/linux/fs.h | 1 +
include/linux/nfs_fs.h | 1 +
mm/truncate.c | 8 ++++++++
mm/vmscan.c | 11 +++++++++++
8 files changed, 52 insertions(+), 12 deletions(-)
--
1.7.3.2
--
We need to ensure that the entries in the nfs_cache_array get cleared
when the page is removed from the page cache. To do so, we use the
freepage address_space operation.
Change nfs_readdir_clear_array to use kmap_atomic(), so that the
function can be safely called from all contexts.
Finally, modify the cache_page_release helper to call
nfs_readdir_clear_array directly, when dealing with an anonymous
page from 'uncached_readdir'.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 18 +++++++++---------
fs/nfs/inode.c | 1 +
include/linux/nfs_fs.h | 1 +
3 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e03537f..d529e0e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -83,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
.setattr = nfs_setattr,
};
-const struct address_space_operations nfs_dir_addr_space_ops = {
- .releasepage = nfs_readdir_clear_array,
+const struct address_space_operations nfs_dir_aops = {
+ .freepage = nfs_readdir_clear_array,
};
#ifdef CONFIG_NFS_V3
@@ -214,17 +214,15 @@ void nfs_readdir_release_array(struct page *page)
* we are freeing strings created by nfs_add_to_readdir_array()
*/
static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array;
int i;
- if (IS_ERR(array))
- return PTR_ERR(array);
+ array = kmap_atomic(page, ...From: Linus Torvalds <torvalds@linux-foundation.org>
NFS needs to be able to release objects that are stored in the page
cache once the page itself is no longer visible from the page cache.
This patch adds a callback to the address space operations that allows
filesystems to perform page cleanups once the page has been removed
from the page cache.
Original patch by: Linus Torvalds <torvalds@linux-foundation.org>
[trondmy: cover the cases of invalidate_inode_pages2() and
truncate_inode_pages()]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
include/linux/fs.h | 1 +
mm/truncate.c | 8 ++++++++
mm/vmscan.c | 11 +++++++++++
5 files changed, 33 insertions(+), 1 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index a91f308..b6426f1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,12 +173,13 @@ prototypes:
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
locking rules:
- All except set_page_dirty may block
+ All except set_page_dirty and freepage may block
BKL PageLocked(page) i_mutex
writepage: no yes, unlocks (see below)
@@ -193,6 +194,7 @@ perform_write: no n/a yes
bmap: no
invalidatepage: no yes
releasepage: no yes
+freepage: no yes
direct_IO: no
launder_page: no yes
@@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It returns zero to
indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
the kernel assumes that the fs ...I think Linus recommended that one be done in remove_from_page_cache() I took his "stop_machine()" explanation ("an idle period for everything. And just a preemption reschedule isn't enough for that") to imply that there's no need for your preempt_disable/preempt_enable there: they don't add anything to the module unload case, and they don't help the spin_unlock_irq issue (and you're already being rightly careful to note freepage in advance). But maybe I misunderstood. Hugh --
I'm fine with doing that as long as everyone is happy that there is no chance of races. I was a bit nervous given the discussion that ensued Again, I was being cautious (I freely admit to not having studied stop_machine()). If nobody disagrees with your interpretation, then I'm very happy to drop the preempt disable/enable crud. Cheers Trond -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
On Wed, Dec 1, 2010 at 7:53 PM, Trond Myklebust
Just remove it. It won't matter for the stop_machine case, and the
other case that Andrew pointed out (just unmap with memory pressure)
is an independent thing that doesn't do stop_machine anyway.
In the next merge window we'll see the whole RCU name lookup (let's
see if people can agree on it, or whether I'll just have to do an
executive decision and push it through even without consensus), which
then rcu-delays inode freeing, and that will fix it for real. When
that happens, doing a "rcu_read_lock()" in vmscan.c before the removal
of the page from the mapping, and then unlocking after the callback
will be a real fix, but it will require the rcu release to be that
real fix anyway, so doing something else now will just confuse things.
Linus
--
v2 fixes the following issues:
- Fix up the cookie usage in uncached_readdir()
- Changelog fixups for the second patch
- .releasepage should use kmap_atomic() so that it can be called
from all direct reclaim contexts.
- Ensure that .releasepage also clears Pg_uptodate
- Set/clear the Pg_private flag to ensure .releasepage gets called when
appropriate.
- Add a .invalidatepage to ensure truncate_inode_pages() works correctly
- Ensure that the anonymous page that is generated by uncached_readdir()
doesn't leak memory.
v3:
- add the freepage() address space operation.
- Dump the page locking
- rewrite patch 'Fix a memory leak in nfs_readdir' to work with the new
->freepage() callback.
v4:
- Remove the preempt disable/enable protection in __remove_mapping
- Add a freepage() call to remove_from_page_cache() instead of
open-coding it in truncate_complete_page().
Cheers
Trond
Linus Torvalds (1):
Call the filesystem back whenever a page is removed from the page
cache
Trond Myklebust (2):
NFS: Ensure we use the correct cookie in nfs_readdir_xdr_filler
NFS: Fix a memory leak in nfs_readdir
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
fs/nfs/dir.c | 28 +++++++++++++++++-----------
fs/nfs/inode.c | 1 +
include/linux/fs.h | 1 +
include/linux/nfs_fs.h | 1 +
mm/filemap.c | 5 +++++
mm/truncate.c | 4 ++++
mm/vmscan.c | 7 +++++++
9 files changed, 49 insertions(+), 12 deletions(-)
--
1.7.3.2
--
We need to ensure that the entries in the nfs_cache_array get cleared
when the page is removed from the page cache. To do so, we use the
freepage address_space operation.
Change nfs_readdir_clear_array to use kmap_atomic(), so that the
function can be safely called from all contexts.
Finally, modify the cache_page_release helper to call
nfs_readdir_clear_array directly, when dealing with an anonymous
page from 'uncached_readdir'.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 18 +++++++++---------
fs/nfs/inode.c | 1 +
include/linux/nfs_fs.h | 1 +
3 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e03537f..d529e0e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
struct inode *, struct dentry *);
static int nfs_fsync_dir(struct file *, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -83,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
.setattr = nfs_setattr,
};
-const struct address_space_operations nfs_dir_addr_space_ops = {
- .releasepage = nfs_readdir_clear_array,
+const struct address_space_operations nfs_dir_aops = {
+ .freepage = nfs_readdir_clear_array,
};
#ifdef CONFIG_NFS_V3
@@ -214,17 +214,15 @@ void nfs_readdir_release_array(struct page *page)
* we are freeing strings created by nfs_add_to_readdir_array()
*/
static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array;
int i;
- if (IS_ERR(array))
- return PTR_ERR(array);
+ array = kmap_atomic(page, ...From: Linus Torvalds <torvalds@linux-foundation.org>
NFS needs to be able to release objects that are stored in the page
cache once the page itself is no longer visible from the page cache.
This patch adds a callback to the address space operations that allows
filesystems to perform page cleanups once the page has been removed
from the page cache.
Original patch by: Linus Torvalds <torvalds@linux-foundation.org>
[trondmy: cover the cases of invalidate_inode_pages2() and
truncate_inode_pages()]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
Documentation/filesystems/Locking | 7 ++++++-
Documentation/filesystems/vfs.txt | 7 +++++++
include/linux/fs.h | 1 +
mm/filemap.c | 5 +++++
mm/truncate.c | 4 ++++
mm/vmscan.c | 7 +++++++
6 files changed, 30 insertions(+), 1 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index a91f308..b6426f1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,12 +173,13 @@ prototypes:
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
int (*launder_page) (struct page *);
locking rules:
- All except set_page_dirty may block
+ All except set_page_dirty and freepage may block
BKL PageLocked(page) i_mutex
writepage: no yes, unlocks (see below)
@@ -193,6 +194,7 @@ perform_write: no n/a yes
bmap: no
invalidatepage: no yes
releasepage: no yes
+freepage: no yes
direct_IO: no
launder_page: no yes
@@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it. It returns zero to
indicate that the buffers are (or may be) freeable. If ->releasepage ...Of course we still have bugs in this regard, without inode RCU and filesystem deregistration RCU, but when those things are implemented for RCU path-walk, this section should be updated somewhat, and we'll have to look at RCU protecting the final mapping manipulations after a page is removed from pagecache. The generic parts of the code look OK to me, but why is there a difference in your sequences of loading the freepage function pointer here? --
We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for (except for the case of
uncached_readdir).
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e..e03537f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;
@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
+ if (status == -EAGAIN)
+ desc->last_cookie = array->last_cookie;
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
unsigned int array_size = ARRAY_SIZE(pages);
entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
int res;
- if (desc->page_index == 0)
+ if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
while (1) {
res = find_cache_page(desc);
if (res != -EAGAIN)
@@ -764,6 +769,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
}
desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
desc->page = page;
status = nfs_readdir_xdr_to_array(desc, page, inode);
--
1.7.3.2
--
We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for (except for the case of
uncached_readdir).
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e..e03537f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;
@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
+ if (status == -EAGAIN)
+ desc->last_cookie = array->last_cookie;
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
unsigned int array_size = ARRAY_SIZE(pages);
entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
int res;
- if (desc->page_index == 0)
+ if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
while (1) {
res = find_cache_page(desc);
if (res != -EAGAIN)
@@ -764,6 +769,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
}
desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
desc->page = page;
status = nfs_readdir_xdr_to_array(desc, page, inode);
--
1.7.3.2
--
Yeah, it's a tricky question. It would be solved if the inode reclaim code didn't have the nasty shortcuts for nr_pages == 0 sitting outisde the tree_lock... any time we have these kinds of optimisations checking So doing those checks under lock would be a reasonable way to fix it if anyone cares for 2.6.37 or earlier (eg. distros). But it is another lock in inode freeing path which is nice to avoid, so let's just get Yep, good observation. Thanks, Nick --
Although the page is locked, it may no longer be visible to the lockless page lookup once the radix_tree_delete() completes in __remove_from_page_cache. Furthermore, if the same routine causes mapping->nr_pages to go to zero before iput_final() hits truncate_inode_pages(), then the latter exits immediately. Both these cases would appear to allow iput_final() to release the inode before vmscan gets round to unlocking the mapping->tree_lock since truncate_inode_pages() no longer thinks it has any work to do. -- Trond Myklebust Linux NFS client maintainer NetApp Trond.Myklebust@netapp.com www.netapp.com --
I'm not so sure of that: doesn't it test inode->i_data.nrpages in various places, and skip ahead if that is already 0? I don't see Okay, I'm reassured on my module unload point; but not on the question of safety of spin_unlock_irq(&mapping->tree_lock) which Trond lobbed back in return. Hugh --
We need to use the cookie from the previous array entry, not the
actual cookie that we are searching for (except for the case of
uncached_readdir).
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/dir.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f0a384e..e03537f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -178,6 +178,7 @@ typedef struct {
struct page *page;
unsigned long page_index;
u64 *dir_cookie;
+ u64 last_cookie;
loff_t current_index;
decode_dirent_t decode;
@@ -344,6 +345,8 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
else
status = nfs_readdir_search_for_cookie(array, desc);
+ if (status == -EAGAIN)
+ desc->last_cookie = array->last_cookie;
nfs_readdir_release_array(desc->page);
out:
return status;
@@ -563,7 +566,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
unsigned int array_size = ARRAY_SIZE(pages);
entry.prev_cookie = 0;
- entry.cookie = *desc->dir_cookie;
+ entry.cookie = desc->last_cookie;
entry.eof = 0;
entry.fh = nfs_alloc_fhandle();
entry.fattr = nfs_alloc_fattr();
@@ -672,8 +675,10 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
{
int res;
- if (desc->page_index == 0)
+ if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
while (1) {
res = find_cache_page(desc);
if (res != -EAGAIN)
@@ -764,6 +769,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
}
desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
desc->page = page;
status = nfs_readdir_xdr_to_array(desc, page, inode);
--
1.7.3.2
--
