login
Header Space

 
 

Re: [PATCH 2/2] ptrace children revamp

Previous thread: [PATCH] dm9000 trivial annotation by Al Viro on Friday, March 28, 2008 - 11:11 pm. (1 message)

Next thread: PPC: 'xchg_u32' is static but used in inline function 'xchg_ptr' which is not static by Meelis Roos on Saturday, March 29, 2008 - 6:32 am. (1 message)
To: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>
Cc: Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Friday, March 28, 2008 - 11:34 pm

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath &lt;roland@redhat.com&gt;
---
 kernel/exit.c |  191 +++++++++++++++++++++++++++++++++++----------------------
 1 files changed, 118 insertions(+), 73 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..f2cf0a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1442,89 +1442,136 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero if we have unlocked tasklist_lock and have
+ * the final return value ready in *@retval.  Returns zero if
+ * the search for a child should continue; then *@retval is 0
+ * if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+			      struct task_struct *p, int *retval,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(ret &lt; 0)) {
+		read_unlock(&amp;tasklist_lock);
+		*retval = ret;
+		return 1;
+	}
+
+	if (task_is_stopped_or_traced(p)) {
+		/*
+		 * It's stopped now, so it might later
+		 * continue, exit, or stop again.
+		 */
+		*retval = 0;
+		if ((p-&gt;ptrace &amp; PT_PTRACED) ||
+		    (options &amp; WUNTRACED)) {
+			*retval = wait_task_stopped(p, (options &amp; WNOWAIT),
+						    infop, stat_addr, ru);
+			if (*retval)
+				return 1;
+		}
+	} else if (p-&gt;exit_state == EXIT_ZOMBIE &amp;&amp; !delay_group_leader(p)) {
+		/*
+		 * We don't reap group leaders with subthreads.
+		 */
+		if (likely(options &amp; WEXITED...
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Saturday, March 29, 2008 - 12:16 pm

I think it would be even more obvious (or, to use your phrase, "less 
nonobvious") if this was written like so:

	if (task_is_stopped_or_traced(p)) {
		...
		....
			if (*retval}
				return 1;
		}
		return 0;
	}

	if (p-&gt;exit_state == EXIT_ZOMBIE &amp;&amp; !delay_group_leader(p)) {
		...
		return 0;
	}

	if (...)

because then you can clearly see that smething like the
"task_is_stopped_or_traced(p)" case is complete in itself, and only has 
its own local stuff going on.

(And at some point I'd also almost make each case a trivial small inline 
function of its on, but in this case there are so many arguments to pass 
around that it probably becomes _less_ readable that way).

I also wonder if you really need both "int *retval" and the return value. 
Isn't "retval" always going to be zero or a negative errno? And the return 
value is going to be either true of false? Why not just fold them into one 
single thing:

 - negative: all done, with error
 - zero: this didn't trigger, continue with the next one in caller
 - positive: this thread triggered, all done, return 0 in the caller.

which is (I think) close to what we already do in eligible_child() (so 
this would not be a new calling convention for this particular code).

		Linus
--
To: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>
Cc: Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Sunday, March 30, 2008 - 11:57 pm

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath &lt;roland@redhat.com&gt;
---
 kernel/exit.c |  194 +++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..237e3d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1160,7 +1160,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
 			    struct siginfo __user *infop,
 			    int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1168,7 +1168,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 
-	if (unlikely(noreap)) {
+	if (!likely(options &amp; WEXITED))
+		return 0;
+
+	if (unlikely(options &amp; WNOWAIT)) {
 		uid_t uid = p-&gt;uid;
 		int exit_code = p-&gt;exit_code;
 		int why, status;
@@ -1320,13 +1323,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
  * released the lock and the system call should return.
  */
 static int wait_task_stopped(struct task_struct *p,
-			     int noreap, struct siginfo __user *infop,
+			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
 	int retval, exit_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
+	if (!(p-&gt;ptrace &amp; PT_PTRACED) &amp;&amp; !(options &amp; WUNTRACED))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&amp;p-&gt;sighand-&gt;siglock);
 
@@ -1344,7 +1...
To: Roland McGrath <roland@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 4:51 am

(I apologize in advance if I missed something, I'm not able to study this series
 carefully now. But I hope that a false alarm is better than a probable bug.)


This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

We can race with another thread which changed EXIT_ZOMBIE-&gt;EXIT_DEAD but
hasn't released the child yet. Suppose we don't have other childs, in that
case do_wait() will falsely sleep on -&gt;wait_chldexit (unless WHOHANG).

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 4:29 pm

&gt; This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

You're right.  Thanks for catching that.
I think it should look like this:

    {
	    int ret = eligible_child(type, pid, options, p);
	    if (ret &lt;= 0)
		    return ret;

	    if (p-&gt;exit_state == EXIT_DEAD)
		    return 0;

	    /*
	     * We don't reap group leaders with subthreads.
	     */
	    if (p-&gt;exit_state == EXIT_ZOMBIE &amp;&amp; !delay_group_leader(p))
		    return wait_task_zombie(p, options, infop, stat_addr, ru);

	    /*
	     * It's stopped or running now, so it might
	     * later continue, exit, or stop again.
	     */
	    *retval = 0;

	    if (task_is_stopped_or_traced(p))
		    return wait_task_stopped(p, options, infop, stat_addr, ru);

	    return wait_task_continued(p, options, infop, stat_addr, ru);
    }

What do you think?


Thanks,
Roland
--
To: Roland McGrath <roland@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 4:07 pm

I think you are right.

(BTW, you moved the "options &amp; WHATEVER" checks into wait_task_xxx(),
 I think this really makes the code more readable and maintainable).

Oleg.

--
To: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>
Cc: Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Sunday, March 30, 2008 - 11:59 pm

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks".  That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c.  The rationale for the original
commit still stands.  The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure.  A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.

Signed-off-by: Roland McGrath &lt;roland@redhat.com&gt;
---
 kernel/exit.c |   31 +++++++++++++++++++++----------
 1 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index b5057ce..2f0392d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1116,14 +1116,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 
 	err = security_task_wait(p);
-	if (likely(!err))
-		return 1;
+	if (err)
+		return err;
 
-	if (type != PIDTYPE_PID)
-		return 0;
-	/* This child was explicitly requested, abort */
-	read_unlock(&amp;tasklist_lock);
-	return err;
+	return 1;
 }
 
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1454,7 +1450,8 @@ static int wait_task_continued(struct task_struct *p, int options,
  * -ECHILD should be in *@retval before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then *@retval is
- * 0 if @p is an eligible child, or still -ECHILD.
+ * 0 if @p is an eligible child, or another error from security_task_wait(),
+ * or still -ECHILD.
  */
 static int wait_consider_task(struct task_struct *parent,
 			      struct task_struct *p, int *r...
To: Roland McGrath <roland@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 7:03 am

OK, it turns out I misunderstood the purpose of 73243284463a761e04d69d22c7516b2be7de096c,
its changelog says:

	wait* syscalls return -ECHILD even when an individual PID of a live child
	was requested explicitly, when security_task_wait denies the operation.
	This means that something like a broken SELinux policy can produce an
	unexpected failure that looks just like a bug with wait or ptrace or
	something.

I wrongly thought that "-ECHILD even when an individual PID ... was requested"

OK, thanks. Again, I was confused and thought we should "hide" -EACCES

Not that I blame this patch...

Suppose that we have 2 childs. The first one is running, the second is zombie
but nacked by security_task_wait(). Now waitpid(-1, WHOHANG|WEXITED) returns 0,
a bit strange/confusing.

Yes, we have the same behaviour before this patch, but after reading your
explanation I am starting to think this is not "optimal".

Don't get me wrong, I don't claim this should be changed, just I'd like to be
sure this didn't escape your attention.

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 4:20 pm

What else would it do?  There is a child that will be reapable eventually
(the running one).  It might be even more strange and more confusing if you
hid the running child just because there is a permission-denied one.  If
you ask specifically for the permission-denied zombie you'll get the
permission error.  I'm not sure we can do better.


Thanks,
Roland


--
To: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>
Cc: Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Sunday, March 30, 2008 - 11:59 pm

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees.  Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath &lt;roland@redhat.com&gt;
---
 include/linux/init_task.h |    2 +-
 include/linux/sched.h     |   27 +++----
 kernel/exit.c             |  183 ++++++++++++++++++++++-----------------------
 kernel/fork.c             |    4 +-
 kernel/ptrace.c           |   18 ++---
 5 files changed, 112 insertions(+), 122 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
-	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
+	.ptrace_attach	= LIST_HEAD_INIT(tsk.ptrace_attach),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
 	.real_parent	= &amp;tsk,						\
 	.parent		= &amp;tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
-	/*
-	 * ptrace_list/ptrace_children forms the list of my children
-	 * that were stolen by a ptracer.
-	 */
-	struct list_head ptrace_children;
-	struct list_head ptrace_list;
 
 	struct mm_struct *mm, *active_mm;
 
@@ -1070,18 +1064,26 @@ struct task_struct {
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, re...
To: Roland McGrath <roland@...>
Cc: Linus Torvalds <torvalds@...>, Andrew Morton <akpm@...>, <linux-kernel@...>
Date: Monday, March 31, 2008 - 5:12 am

^^^^^^^^^^^^^^^^^^^^^^^
I still think this is wrong. Please also look at my previous reply,

Looks wrong... Above this "p-&gt;parent != parent" check we have

	if (p-&gt;exit_state == EXIT_ZOMBIE &amp;&amp; !delay_group_leader(p))
		return wait_task_zombie(p, options, infop, stat_addr, ru);

Suppose that p is not a group leader, it is traced by some unrelated task,
and now it is EXIT_ZOMBIE. We shouldn't release it (and return the "bogus"
pid to user-space).

Oleg.

--
To: Linus Torvalds <torvalds@...>
Cc: Andrew Morton <akpm@...>, Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Sunday, March 30, 2008 - 11:27 pm

You listed the three possibilities for eligible_child().
For wait_consider_task(), there are four possibilities:

 - all done, with error
 - this thread was not eligible, look for another; return -ECHILD if none ready
 - this thread was eligible but is not ready; return 0 or block if none ready
 - all done, this thread is ready; return its pid

I'll post another version that I think you'll like a little better.


Thanks,
Roland
--
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Saturday, March 29, 2008 - 6:35 am

Please note that eligible_child() drops tasklist_lock if it returns error

This is not right. If ret &lt; 0, we set *retval = ret, but then return 0.
We should return 1, so that the caller (do_wait) will report the error.

Note again that eligible_child() has already dropped tasklist, so not
only we don't return the error, we continue to run without tasklist

If !retval and WNOHANG, we should return -ECHILD, but with this patch
we return 0.

Perhaps I missed something, but personally I don't like the usage of
"int *retval", imho this really complicates the code. I think it is better
to use the returned values directly, but pass "&amp;flag" to do_wait_thread().
We only need the pointer to avoid the unnecessary scanning of -&gt;ptrace_children.
Better yet, we can split do_wait_thread() into 2 functions, and do not pass
the pointer at all. But I didn't read the next patch yet (will do tomorrow),
perhaps I missed the point of this approach.

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Sunday, March 30, 2008 - 11:54 pm

Oops!  Thanks for catching that.  I first got the code into shape before I
noticed your change:

commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3 "do_wait: fix security checks"

I let that one slip off the end of my queue and never gave it proper review.
I tweaked my patch hastily to keep in line with that change and was careless.

I would have objected to that change at the time had I been up to speed.
AFAICT your objection was based solely on the inconsistent treatment of
children PTRACE_ATTACH'd by another.  That was an inadvertent omission in
my original change in commit 73243284463a761e04d69d22c7516b2be7de096c (and
seems obviously so to me).  That bug in no way invalidates the motivation
for the original change.  But you threw the baby out with the bathwater.

The original change came up in response to an actual problem in the real
world.  There was a bug in SELinux and/or a particular security policy that
made it impossible to debug some daemons with gdb.  The bug was an
internally inconsistent set of security checks, so root running gdb was
allowed to ptrace the daemon process, that process was allowed to send gdb
a SIGCHLD and wake it up, but gdb was not allowed to see it with wait4().

The symptom of this bug was that gdb said "wait: No children".  
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed with ECHILD even
though ps and /proc/PID/status and everything the developer knew said
wait4() should consider the daemon process a waitable child of gdb
(which had used PTRACE_ATTACH).  The reasonable developer says, "This
is strange bug in ptrace or wait, and I'm just stuck; send it to Roland."

So I debugged the whole thing and fixed an SELinux bug for the SELinux
people.  Then I imagined how it could have been:

The symptom of this bug was that gdb said "wait: Permission denied".
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed ...
To: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>
Cc: Oleg Nesterov <oleg@...>, <linux-kernel@...>
Date: Friday, March 28, 2008 - 11:35 pm

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees.  Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath &lt;roland@redhat.com&gt;
---
 include/linux/init_task.h |    2 +-
 include/linux/sched.h     |   27 +++----
 kernel/exit.c             |  180 ++++++++++++++++++++++-----------------------
 kernel/fork.c             |    4 +-
 kernel/ptrace.c           |   18 ++---
 5 files changed, 111 insertions(+), 120 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
-	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
+	.ptrace_attach	= LIST_HEAD_INIT(tsk.ptrace_attach),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
 	.real_parent	= &amp;tsk,						\
 	.parent		= &amp;tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
-	/*
-	 * ptrace_list/ptrace_children forms the list of my children
-	 * that were stolen by a ptracer.
-	 */
-	struct list_head ptrace_children;
-	struct list_head ptrace_list;
 
 	struct mm_struct *mm, *active_mm;
 
@@ -1070,18 +1064,26 @@ struct task_struct {
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, re...
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Saturday, March 29, 2008 - 6:39 am

I didn't read the patch yet (will do), just a minor nit right now,


Afaics, this adds a minor pessimization. We shouldn't scan -&gt;ptrace_attach
list if it was already found that another thread has a "hidden" task.

Oleg.

--
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Saturday, March 29, 2008 - 9:10 am

Please ignore. I confused -&gt;ptrace_attach with -&gt;ptrace_children which
doesn't exists any longer. The added pessimization is very minor.


I personally think this is very nice change, it really makes the code better.


FYI, arch/mips/kernel/irixsig.c uses add_parent/remove_parent. I don't know

Unless I missed something again, this is not 100% right.

Suppose that we are -&gt;real_parent, the child was traced by us, we ignore
SIGCHLD, and we are doing the threaded reparenting. In that case we should
release the child if it is dead, like the current code does.

Even if we don't ignore SIGCHLD, we should notify our thread-group, but
reparent_thread() skips do_notify_parent() when the new parent is from
is from the same thread-group.

Note also that reparent_thread() has a very old bug. When it sees the
EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
becomes detached because the new parent ignores SIGCHLD. Currently this means
that init must have a handler for SIGCHLD or we leak zombies.

(btw, this patch has many conflicts with Linus's or Andrew's tree)

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Friday, April 4, 2008 - 5:00 pm

I took it to be referring to what replaced the field that no longer exists.

That badly needs to be cleaned up.  As per recent comments from Linus in
another thread, I think we are best off letting this old cruft get broken

I tried to construct a test case to demonstrate this behavior on the
existing kernel.  I couldn't manage to get it to do this.  Are you this
is "like the current code does"?  I'll send you my test case attempts
off-list and we can try together to get the right test to show this.  If
it turns out that it does not really behave this way now and so my code
is not introducing any regression, then (as usual) I'd rather fix this

Why do you think this is wrong?  The notification is always group-wide
(a SIGCHLD to the group, and wakeup of the group-shared wait_chldexit).

I had noticed that.  After the cleanup, we can look into fixing that.
As usual, I am first going for a set of cleanup patches that changes


This was an inadvertent change from some sloppy last-minute rebasing.
Sorry about that.  Both of these two errors were not there in the
version of the code I read over a lot of times, and I flubbed them in
when rejiggering the patches after some of the earlier feedback.  
In the correct code, the ptrace-stolen check is the very first thing
after calling eligible_child and checking its return value.

When we have resolved the issue about the test case above, I will post
a new series that fixes these flubs.


Thanks,
Roland
--
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Saturday, April 5, 2008 - 10:06 am

Heh. You are right, the current kernel has the same (minor) bug.
I beleive it was introduced by b2b2cbc4b2a2f389442549399a993a8306420baf.
Before this patch we notified the new parent even if it is from the same
thread group. If SIGCHLD is ignored, do_notify_parent() sets -&gt;exit_signal = -1,
and then forget_original_parent() notices this and adds the child to ptrace_dead.
Now we don't notify the parent, and so the child "hangs".


However, now I think your patch adds a more serious problem, we can really
leak a zombie. Suppose that that we are -&gt;real_parent, the child was traced
by us, it is zombie now, and the child is not a group leader. No matter who
is the new parent, no matter what is the state of SIGCHLD, we must not reparent
the child: nobody can release it except us. It is not traced any longer,
its -&gt;exit_signal == -1, eligible_child() doesn't like this.


Well, I was thinking about another thread (the new parent) sleeping in
do_wait(__WNOTHREAD)... not sure this really matters, though.



Yes, yes, I agree. I just wanted to be sure you didn't miss this bug.

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Wednesday, April 9, 2008 - 4:15 pm

I have a regression test program for that (the one I sent you).

(Yes!)  Ah, thanks for that.  I wrote a regression test program for this
case that worked before and fails with my last version of the patch.

I'm pretty sure it doesn't matter in real life.  I doubt whoever uses
__WNOTHREAD was relying on seeing some other thread's reparented
children.  TBH, I don't care much about __WNOTHREAD and I don't know off
hand of anything that actually uses it.  OTOH, an extra wakeup on
wait_chldexit is always harmless.  And for pure anality, it seems proper
to maintain the invariant that do_wait() wakes up promptly whenever
interrupting and restarting it would complete without blocking.  But, we
don't get that wakeup now and noone has complained, so all in all, I'm
inclined to leave it as it is.

I've put the latest tweaked version of this same patch series at:
	http://people.redhat.com/roland/kernel-patches/ptrace-cleanup.mbox
That adds a fourth patch that fixes the aforementioned bug case that the
current canonical kernel gets wrong.  I think that fix also incidentally
covered the init-ignores-SIGCHLD case, but I didn't test that and I'm
not really positive.

I'm working on a variant of the ptrace revamp where all ptrace'd tasks
go on a list (whether natural children or not).  (This was my original
intent, but then I thought it might be more complication and change that
way.  Now it's seeming attractive again.)  I tend towards this approach
aesthetically because it makes ptrace bookkeeping more consistent across
all kinds of tracees.  For the cases we've been talking about, it means
ptrace_exit() would take care of zombies kept alive by ptrace uniformly
before we get to the reparent_thread loop.  This means the init case is
separate and needs a separate fix, but that kind of seems cleaner.  When
I get the variant patch working, we can see which one looks best.  I'd
like your opinion on that.


Thanks,
Roland
--
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Sunday, April 13, 2008 - 10:24 am

Sorry for delay!


Agreed. I never understood why we need __WNOTHREAD. The same for
-&gt;pdeath_signal in its current "per-thread" form. I think it would be
very nice to kill them both (or send the -&gt;pdeath_signal when the whole
process exits). Then we can place all childrens on one signal-&gt;children

I think the 4th patch has a small problem,

	reparent_zombie:

		if (p-&gt;exit_signal == -1 ||
		    (thread_group_empty(p) &amp;&amp; ignoring_children(p-&gt;real_parent)))
			list_add(&amp;p-&gt;ptrace_list, dead);

The 2nd case, "thread_group_empty(p) &amp;&amp; ignoring_children", looks racy.
We didn't set -&gt;exit_signal == -1, the new parent can call do_wait() and

Yes! I thought about this too. Actually, I was very sure that this is your
plan from the the very beginning ;)

Oleg.

--
To: Oleg Nesterov <oleg@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Monday, April 14, 2008 - 9:41 pm

&gt; Sorry for delay!


I'm in complete agreement here, but indeed it's to be considered later.
I never understood pdeath_signal, but I recall someone piping up before
and saying it really was used with its current semantics, so go figure.
(Btw, we can move -&gt;children and still keep __WNOTHREAD compatibility.
It just has to check p-&gt;parent == current for __WNOTHREAD.  We can do
that if we determine that __WNOTHREAD is purely for anal backward
compatibility, and noone cares about the performance difference of

Thanks.  I think we're moving on to the other variant of the patch now,

You were quite right!  I somehow tricked myself into trying something
inferior first.  Silly me.  :-)


Thanks,
Roland
--
To: Roland McGrath <roland@...>
Cc: Andrew Morton <akpm@...>, Linus Torvalds <torvalds@...>, <linux-kernel@...>
Date: Saturday, March 29, 2008 - 10:37 am

Also, I think ptrace_exit() is not right,

	if (p-&gt;exit_signal != -1 &amp;&amp; !thread_group_empty(p))
		do_notify_parent(p, p-&gt;exit_signal);

note the "!thread_group_empty()" above, I guess this is typo, thread group
must be empty if we are going to release the task or notify its parent.


IOW, perhaps something like the patch below makes sense.

Oleg.

--- x/kernel/exit.c~x	2008-03-29 17:14:54.000000000 +0300
+++ x/kernel/exit.c	2008-03-29 17:28:17.000000000 +0300
@@ -596,6 +596,16 @@ static void exit_mm(struct task_struct *
 	mmput(mm);
 }
 
+static void xxx(struct task_struct *p, struct list_head *dead)
+{
+	if (p-&gt;exit_state == EXIT_ZOMBIE) {
+		if (p-&gt;exit_signal != -1 &amp;&amp; thread_group_empty(p))
+			do_notify_parent(p, p-&gt;exit_signal);
+		if (p-&gt;exit_signal == -1)
+			list_add(&amp;p-&gt;ptrace_list, dead);
+	}
+}
+
 /*
  * Detach any ptrace attachees (not our own natural children).
  * Any that need to be release_task'd are put on the @dead list.
@@ -616,12 +626,7 @@ static void ptrace_exit(struct task_stru
 		 * reap itself, add it to the @dead list.  We can't call
 		 * release_task() here because we already hold tasklist_lock.
 		 */
-		if (p-&gt;exit_state == EXIT_ZOMBIE) {
-			if (p-&gt;exit_signal != -1 &amp;&amp; !thread_group_empty(p))
-				do_notify_parent(p, p-&gt;exit_signal);
-			if (p-&gt;exit_signal == -1)
-				list_add(&amp;p-&gt;ptrace_list, dead);
-		}
+		 xxx(p, dead);
 	}
 }
 
@@ -661,13 +666,6 @@ static void reparent_thread(struct task_
 	if (p-&gt;exit_signal != -1)
 		p-&gt;exit_signal = SIGCHLD;
 
-	/* If we'd notified the old parent about this child's death,
-	 * also notify the new parent.
-	 */
-	if (p-&gt;exit_state == EXIT_ZOMBIE &amp;&amp;
-	    p-&gt;exit_signal != -1 &amp;&amp; thread_group_empty(p))
-		do_notify_parent(p, p-&gt;exit_signal);
-
 	/*
 	 * process group orphan check
 	 * Case ii: Our child is in a different pgrp
@@ -720,6 +718,7 @@ static void forget_original_parent(struc
 ...
Previous thread: [PATCH] dm9000 trivial annotation by Al Viro on Friday, March 28, 2008 - 11:11 pm. (1 message)

Next thread: PPC: 'xchg_u32' is static but used in inline function 'xchg_ptr' which is not static by Meelis Roos on Saturday, March 29, 2008 - 6:32 am. (1 message)
speck-geostationary