[PATCH] Immediate Values x86 Optimization Declare Discarded Instruction

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: H. Peter Anvin <hpa@...>
Cc: <akpm@...>, <linux-kernel@...>, Andi Kleen <ak@...>, Chuck Ebbert <cebbert@...>, Christoph Hellwig <hch@...>, Jeremy Fitzhardinge <jeremy@...>, Thomas Gleixner <tglx@...>, Ingo Molnar <mingo@...>
Date: Wednesday, November 14, 2007 - 2:52 pm

(The patch follows at the end of the discussion; it applies on top of
the "Immediate Values x86 Optimization (update 2)" patch.)

* H. Peter Anvin (hpa@zytor.com) wrote:

nitpicking :


could we simply declare a 

     .section __discard,"",@progbits instead ?

I think this should remove the need to tweak the linker script, since
the section would not be allocated.


So in general we would have :

    .org . + ((-.-(2b-1b)) & (immediate_size - 1)), 0x90

Here is the result, on i386, for an immediate value used by markers. I
normally use a 1 byte immediate value for this, but I tried to change
the data type to check the result. The resulting assembly and binary
generated on x86_64 follow.

* A 1 byte immediate value (no need to align) :

#APP
        .section __immediate,"a",@progbits
         .long __mark_kernel_sched_try_wakeup.30104+8, (3f)-1, 1
        .previous
        mov $0,%al
        3:

.LVL1116:
#NO_APP
.LBE2180:
        testb   %al, %al
        jne     .L1028


* A 2 bytes immediate value :

#APP
	.section __discard,"",@progbits
	1:
	mov $0,%ax
	2:
	.previous
	.section __immediate,"a",@progbits
	 .long __mark_kernel_sched_try_wakeup.30104+8, (3f)-2, 2
	.previous
	.org . + ((-.-(2b-1b)) & (2-1)), 0x90
	mov $0,%ax
	3:
	
.LVL1116:
#NO_APP
.LBE2180:
	testw	%ax, %ax
	jne	.L1028


* A 4 bytes immediate value :

#APP
        .section __discard,"",@progbits
        1:
        mov $0,%eax
        2:
        .previous
        .section __immediate,"a",@progbits
         .long __mark_kernel_sched_try_wakeup.30104+8, (3f)-4, 4
        .previous
        .org . + ((-.-(2b-1b)) & (4-1)), 0x90
        mov $0,%eax
        3:
        
.LVL1116:
#NO_APP
.LBE2180:
        testl   %eax, %eax
        jne     .L1028


And on x86_64 :

* A 1 byte immediate value :

#APP
        .section __immediate,"a",@progbits
         .quad __mark_kernel_sched_try_wakeup.31114+16, (3f)-1, 1
        .previous
        mov $0,%al
        3:

.LVL705:
#NO_APP
.LBE2003:
        testb   %al, %al
        je      .L676

Resulting binary :

    28c0:       b0 00                   mov    $0x0,%al
    28c2:       84 c0                   test   %al,%al
    28c4:       74 21                   je     28e7 <try_to_wake_up+0x50>


* A 2 bytes immediate value :
#APP
        .section __discard,"",@progbits
        1:
        mov $0,%ax
        2:
        .previous
        .section __immediate,"a",@progbits
         .quad __mark_kernel_sched_try_wakeup.31114+16, (3f)-2, 2
        .previous
        .org . + ((-.-(2b-1b)) & (2-1)), 0x90
        mov $0,%ax
        3:

.LVL705:
#NO_APP
.LBE2003:
        testw   %ax, %ax
        je      .L676

Resulting binary :

    28c0:       66 b8 00 00             mov    $0x0,%ax
    28c4:       66 85 c0                test   %ax,%ax
    28c7:       74 21                   je     28ea <try_to_wake_up+0x53>


* A 4 bytes immediate value :

#APP
        .section __discard,"",@progbits
        1:
        mov $0,%eax
        2:
        .previous
        .section __immediate,"a",@progbits
         .quad __mark_kernel_sched_try_wakeup.31114+16, (3f)-4, 4
        .previous
        .org . + ((-.-(2b-1b)) & (4-1)), 0x90
        mov $0,%eax
        3:

.LVL705:
#NO_APP
.LBE2003:
        testl   %eax, %eax
        je      .L676

Resulting binary :

    28c0:       90                      nop    
    28c1:       90                      nop    
    28c2:       90                      nop    
    28c3:       b8 00 00 00 00          mov    $0x0,%eax
    28c8:       85 c0                   test   %eax,%eax


* An 8 bytes immediate value :

#APP
        .section __discard,"",@progbits
        1:
        mov $0xFEFEFEFE01010101,%rax
        2:
        .previous
        .section __immediate,"a",@progbits
         .quad __mark_kernel_sched_try_wakeup.31114+16, (3f)-8, 8
        .previous
        .org . + ((-.-(2b-1b)) & (8-1)), 0x90
        mov $0xFEFEFEFE01010101,%rax
        3:

.LVL705:
#NO_APP
.LBE2003:
        testq   %rax, %rax
        je      .L676

And the corresponding binary (for x86_64, using an 8 bytes immediate) :

    28c0:       90                      nop    
    28c1:       90                      nop    
    28c2:       90                      nop    
    28c3:       90                      nop    
    28c4:       90                      nop    
    28c5:       90                      nop    
    28c6:       48 b8 01 01 01 01 fe    mov    $0xfefefefe01010101,%rax
    28cd:       fe fe fe 
    28d0:       48 85 c0                test   %rax,%rax

The patch is at the bottom of this email.

Mathieu


Immediate Values x86 Optimization Declare Discarded Instruction

- Create the instruction in a discarded section to calculate its size. This is
  how we can align the beginning of the instruction on an address that will
  permit atomic modification of the immediate value without knowing the size of
  the opcode used by the compiler.
- Bugfix : 8 bytes 64 bits immediate value was declared as "4 bytes" in the
  immediate structure.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
---
 include/asm-x86/immediate.h |   93 +++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 48 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/immediate.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/immediate.h	2007-11-14 13:39:39.000000000 -0500
+++ linux-2.6-lttng/include/asm-x86/immediate.h	2007-11-14 13:40:18.000000000 -0500
@@ -30,22 +30,18 @@ struct __immediate {
  * Reads the value of @name.
  * Optimized version of the immediate.
  * Do not use in __init and __exit functions. Use _immediate_read() instead.
- * Makes sure the 2 and 4 bytes update will be atomic by aligning the immediate
- * value. 2 bytes (short) uses a 66H prefix. If size is bigger than 4 bytes,
- * fall back on a memory read.
- * The 64 bits load immediates produced by gas have a 2 bytes opcode.
+ * If size is bigger than the architecture long size, fall back on a memory
+ * read.
+ *
  * Make sure to populate the initial static 64 bits opcode with a value
- * what will generate an instruction with 2 bytes opcode and 8 bytes immediate
- * value.
- * A solution that should fit for both i386 and x86_64 would be :
- * 1 byte  : "=Q" : Any register accessible as rh: a, b, c, and d.
- * 2, 4 bytes : "=R" : Legacy register—the eight integer registers available
- *               on all i386 processors (a, b, c, d, si, di, bp, sp).
- * 8 bytes : (only for x86_64)
- *              "=r" : A register operand is allowed provided that it is in a
- *              general register.
- * That should make sure x86_64 won't try to use REX prefixed opcodes for
- * 1, 2 and 4 bytes values.
+ * what will generate an instruction with 8 bytes immediate value (not the REX.W
+ * prefixed one that loads a sign extended 32 bits immediate value in a r64
+ * register).
+ *
+ * Create the instruction in a discarded section to calculate its size. This is
+ * how we can align the beginning of the instruction on an address that will
+ * permit atomic modification of the immediate value without knowing the size of
+ * the opcode used by the compiler. The operand size is known in advance.
  */
 #define immediate_read(name)						\
 	({								\
@@ -53,49 +49,50 @@ struct __immediate {
 		switch (sizeof(value)) {				\
 		case 1:							\
 			asm(".section __immediate,\"a\",@progbits\n\t"	\
-					_ASM_PTR "%c1, (0f)+1, 1\n\t"	\
-					".previous\n\t"			\
-					"0:\n\t"			\
-					"mov $0,%0\n\t"			\
-				: "=Q" (value)				\
-				: "i" (&name##__immediate));		\
+				_ASM_PTR "%c1, (3f)-%c2, %c2\n\t"	\
+				".previous\n\t"				\
+				"mov $0,%0\n\t"				\
+				"3:\n\t"				\
+				: "=q" (value)				\
+				: "i" (&name##__immediate),		\
+				  "i" (sizeof(value)));			\
 			break;						\
 		case 2:							\
-			asm(".section __immediate,\"a\",@progbits\n\t"	\
-					_ASM_PTR "%c1, (0f)+2, 2\n\t"	\
-					".previous\n\t"			\
-					"1:\n\t"			\
-					".align 2\n\t"			\
-					"0:\n\t"			\
-					"mov $0,%0\n\t"			\
-				: "=R" (value)				\
-				: "i" (&name##__immediate));		\
-			break;						\
 		case 4:							\
-			asm(".section __immediate,\"a\",@progbits\n\t"	\
-					_ASM_PTR "%c1, (0f)+1, 4\n\t"	\
-					".previous\n\t"			\
-					"1:\n\t"			\
-					".org . + 3 - (. & 3), 0x90\n\t" \
-					"0:\n\t"			\
-					"mov $0,%0\n\t"			\
-				: "=R" (value)				\
-				: "i" (&name##__immediate));		\
+			asm(".section __discard,\"\",@progbits\n\t"	\
+				"1:\n\t"				\
+				"mov $0,%0\n\t"				\
+				"2:\n\t"				\
+				".previous\n\t"				\
+				".section __immediate,\"a\",@progbits\n\t" \
+				_ASM_PTR "%c1, (3f)-%c2, %c2\n\t"	\
+				".previous\n\t"				\
+				".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
+				"mov $0,%0\n\t"				\
+				"3:\n\t"				\
+				: "=r" (value)				\
+				: "i" (&name##__immediate),		\
+				  "i" (sizeof(value)));			\
 			break;						\
 		case 8:							\
 			if (sizeof(long) < 8) {				\
 				value = name##__immediate;		\
 				break;					\
 			}						\
-			asm(".section __immediate,\"a\",@progbits\n\t"	\
-					_ASM_PTR "%c1, (0f)+1, 4\n\t"	\
-					".previous\n\t"			\
-					"1:\n\t"			\
-					".org . + 6 - (. & 7), 0x90\n\t" \
-					"0:\n\t"			\
-					"mov $0xFEFEFEFE01010101,%0\n\t" \
+			asm(".section __discard,\"\",@progbits\n\t"	\
+				"1:\n\t"				\
+				"mov $0xFEFEFEFE01010101,%0\n\t" 	\
+				"2:\n\t"				\
+				".previous\n\t"				\
+				".section __immediate,\"a\",@progbits\n\t" \
+				_ASM_PTR "%c1, (3f)-%c2, %c2\n\t"	\
+				".previous\n\t"				\
+				".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
+				"mov $0xFEFEFEFE01010101,%0\n\t" 	\
+				"3:\n\t"				\
 				: "=r" (value)				\
-				: "i" (&name##__immediate));		\
+				: "i" (&name##__immediate),		\
+				  "i" (sizeof(value)));			\
 			break;						\
 		default:value = name##__immediate;			\
 			break;						\
-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
-
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
[patch 5/8] Immediate Values - x86 Optimization, Mathieu Desnoyers, (Tue Nov 13, 2:58 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, Rusty Russell, (Wed Nov 14, 11:08 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, Mathieu Desnoyers, (Thu Nov 15, 12:06 am)
Re: [patch 5/8] Immediate Values - x86 Optimization, Rusty Russell, (Thu Nov 15, 12:45 am)
Re: [patch 5/8] Immediate Values - x86 Optimization, Mathieu Desnoyers, (Thu Nov 15, 1:37 am)
Re: [patch 5/8] Immediate Values - x86 Optimization, Rusty Russell, (Thu Nov 15, 7:06 am)
Re: [patch 5/8] Immediate Values - x86 Optimization (simplif..., Mathieu Desnoyers, (Fri Nov 16, 10:03 am)
Re: [patch 5/8] Immediate Values - x86 Optimization (simplif..., Mathieu Desnoyers, (Mon Nov 19, 3:15 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (simplif..., Mathieu Desnoyers, (Mon Nov 19, 10:28 am)
Re: [patch 5/8] Immediate Values - x86 Optimization (simplif..., Mathieu Desnoyers, (Tue Nov 20, 1:02 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, H. Peter Anvin, (Tue Nov 13, 3:07 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, Mathieu Desnoyers, (Tue Nov 13, 3:24 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, H. Peter Anvin, (Tue Nov 13, 3:36 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, Mathieu Desnoyers, (Tue Nov 13, 3:45 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization, H. Peter Anvin, (Tue Nov 13, 3:56 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (update), Mathieu Desnoyers, (Tue Nov 13, 4:40 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (update), Mathieu Desnoyers, (Tue Nov 13, 6:02 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (update), Mathieu Desnoyers, (Tue Nov 13, 8:34 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (update 2), Mathieu Desnoyers, (Tue Nov 13, 9:44 pm)
[PATCH] Immediate Values x86 Optimization Declare Discarded ..., Mathieu Desnoyers, (Wed Nov 14, 2:52 pm)
[PATCH] Add __discard section to x86, Mathieu Desnoyers, (Wed Nov 14, 3:16 pm)
Re: [PATCH] Add __discard section to x86, H. Peter Anvin, (Wed Nov 14, 3:33 pm)
Re: [patch 5/8] Immediate Values - x86 Optimization (update 2), Mathieu Desnoyers, (Wed Nov 14, 10:16 am)