A patchset that adds more this_cpu operations and in particular RMV operations that can be used in various places to avoid address calculations and memory accesses by the user of fast cpu local operations with segment prefixes. V2 has several enhancements and bugfixes that were suggested after V1 V3 removes the cmpxchg patches and focuses on the first extensions of cpu ops that were generally an improvement. For V3 I scanned through the kernel code for obvious cases in which a __get_cpu_var or get_cpu_var can be converted to this_cpu_ops. That is often not possible because addresses of per cpu variables are needed. However, the accesses that could become converted became very cheap because this_cpu_ops typically only generate a single instruction using a segment prefix to perform the relocation to the correct per cpu area. Cpu ops perform implied address calculations. It is therefore not possible to take the address of the result of a this_cpu_xx operation. --
this cpu operations can be used to slightly optimize the function. The
changes will avoid some address calculations and replace them with the
use of the percpu segment register.
If one would have this_cpu_inc_return and this_cpu_dec_return then it
would be possible to optimize inc_zone_page_state and dec_zone_page_state even
more.
V1->V2:
- Fix __dec_zone_state overflow handling
- Use s8 variables for temporary storage.
V2->V3:
- Put __percpu annotations in correct places.
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
mm/vmstat.c | 56 ++++++++++++++++++++++++++++++++------------------------
1 file changed, 32 insertions(+), 24 deletions(-)
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2010-11-29 10:17:28.000000000 -0600
+++ linux-2.6/mm/vmstat.c 2010-11-29 10:36:16.000000000 -0600
@@ -167,18 +167,20 @@ static void refresh_zone_stat_thresholds
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
- x = delta + *p;
+ t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -221,16 +223,19 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)++;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = ...Optimize various per cpu area operations through the new this cpu
operations. These operations avoid address calculations through the use
of segment prefixes avoid multiple memory references through RMW
instructions etc.
Reduces code size:
Before:
christoph@linux-2.6$ size fs/buffer.o
text data bss dec hex filename
19169 80 28 19277 4b4d fs/buffer.o
After:
christoph@linux-2.6$ size fs/buffer.o
text data bss dec hex filename
19138 80 28 19246 4b2e fs/buffer.o
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
fs/buffer.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2010-11-30 09:41:30.000000000 -0600
+++ linux-2.6/fs/buffer.c 2010-11-30 09:55:55.000000000 -0600
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
static void bh_lru_install(struct buffer_head *bh)
{
struct buffer_head *evictee = NULL;
- struct bh_lru *lru;
check_irqs_on();
bh_lru_lock();
- lru = &__get_cpu_var(bh_lrus);
- if (lru->bhs[0] != bh) {
+ if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
struct buffer_head *bhs[BH_LRU_SIZE];
int in;
int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer
get_bh(bh);
bhs[out++] = bh;
for (in = 0; in < BH_LRU_SIZE; in++) {
- struct buffer_head *bh2 = lru->bhs[in];
+ struct buffer_head *bh2 =
+ __this_cpu_read(bh_lrus.bhs[in]);
if (bh2 == bh) {
__brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
- memcpy(lru->bhs, bhs, sizeof(bhs));
+ memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
}
bh_lru_unlock();
@@ -1313,23 +1312,22 @@ static struct ...Use this_cpu_ops in a couple of places in lguest.
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/lguest/boot.c | 2 +-
drivers/lguest/page_tables.c | 2 +-
drivers/lguest/x86/core.c | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
Index: linux-2.6/arch/x86/lguest/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/lguest/boot.c 2010-11-30 12:22:34.000000000 -0600
+++ linux-2.6/arch/x86/lguest/boot.c 2010-11-30 12:24:08.000000000 -0600
@@ -821,7 +821,7 @@ static void __init lguest_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
- __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ __this_cpu_write(vector_irq[i]) = i - FIRST_EXTERNAL_VECTOR;
if (i != SYSCALL_VECTOR)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
Index: linux-2.6/drivers/lguest/page_tables.c
===================================================================
--- linux-2.6.orig/drivers/lguest/page_tables.c 2010-11-30 12:22:34.000000000 -0600
+++ linux-2.6/drivers/lguest/page_tables.c 2010-11-30 12:24:08.000000000 -0600
@@ -1137,7 +1137,7 @@ void free_guest_pagetable(struct lguest
*/
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
- pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
+ pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
pte_t regs_pte;
#ifdef CONFIG_X86_PAE
Index: linux-2.6/drivers/lguest/x86/core.c
===================================================================
--- linux-2.6.orig/drivers/lguest/x86/core.c 2010-11-30 12:22:34.000000000 -0600
+++ linux-2.6/drivers/lguest/x86/core.c 2010-11-30 12:24:08.000000000 -0600
@@ -90,8 +90,8 @@ static void copy_in_guest_info(struct lg
* meanwhile). If that's not the case, we pretend everything in the
* Guest has ...This doesn't even compile :(
I've applied it, and applied the following fixes, too:
lguest: compile fixes
arch/x86/lguest/boot.c: In function ‘lguest_init_IRQ’:
arch/x86/lguest/boot.c:824: error: macro "__this_cpu_write" requires 2 arguments, but only 1 given
arch/x86/lguest/boot.c:824: error: ‘__this_cpu_write’ undeclared (first use in this function)
arch/x86/lguest/boot.c:824: error: (Each undeclared identifier is reported only once
arch/x86/lguest/boot.c:824: error: for each function it appears in.)
drivers/lguest/x86/core.c: In function ‘copy_in_guest_info’:
drivers/lguest/x86/core.c:94: error: lvalue required as left operand of assignment
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -821,7 +821,7 @@ static void __init lguest_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
- __this_cpu_write(vector_irq[i]) = i - FIRST_EXTERNAL_VECTOR;
+ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -91,7 +91,7 @@ static void copy_in_guest_info(struct lg
* Guest has changed.
*/
if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
- __this_cpu_read(lg_last_cpu) = cpu;
+ __this_cpu_write(lg_last_cpu, cpu);
cpu->last_pages = pages;
cpu->changed = CHANGED_ALL;
}
--
Yeah. I had to go through a lot of code and the build process did not Great. Thanks. --
Use this_cpu_ops to reduce code size and simplify things in various places.
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc:
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/xen/enlighten.c | 4 ++--
arch/x86/xen/multicalls.h | 2 +-
arch/x86/xen/spinlock.c | 8 ++++----
arch/x86/xen/time.c | 13 ++++++-------
drivers/xen/events.c | 10 +++++-----
5 files changed, 18 insertions(+), 19 deletions(-)
Index: linux-2.6/arch/x86/xen/enlighten.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/enlighten.c 2010-11-30 12:31:39.000000000 -0600
+++ linux-2.6/arch/x86/xen/enlighten.c 2010-11-30 12:31:42.000000000 -0600
@@ -574,8 +574,8 @@ static void xen_write_idt_entry(gate_des
preempt_disable();
- start = __get_cpu_var(idt_desc).address;
- end = start + __get_cpu_var(idt_desc).size + 1;
+ start = __this_cpu_read(idt_desc.address);
+ end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
Index: linux-2.6/arch/x86/xen/multicalls.h
===================================================================
--- linux-2.6.orig/arch/x86/xen/multicalls.h 2010-11-30 12:31:39.000000000 -0600
+++ linux-2.6/arch/x86/xen/multicalls.h 2010-11-30 12:31:42.000000000 -0600
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
unsigned long flags;
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
- __get_cpu_var(xen_mc_irq_flags) = flags;
+ __this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
Index: linux-2.6/arch/x86/xen/spinlock.c
===================================================================
--- linux-2.6.orig/arch/x86/xen/spinlock.c 2010-11-30 12:31:39.000000000 -0600
+++ linux-2.6/arch/x86/xen/spinlock.c 2010-11-30 12:31:42.000000000 -0600
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinn
{
struct xen_spinlock *prev;
- prev = ...Really? What code does this generate? If this is generating two
segment-prefixed reads rather than getting the address and doing normal
reads on it, then I don't think it is an improvement.
The rest looks OK, I guess. How does it change the generated code?
J
--
Lets drop that hunk. No point to do optimizations at that location then. evt is also not defined then. Without the evt address determination via __get_cpu_var we have at least 2 prefixed load and one address calculation to figure out the parameter to pass. No win. --
__get_cpu_var() can be replaced with this_cpu_read and will then use a single read instruction with implied address calculation to access the correct per cpu instance. However, the address of a per cpu variable passed to __this_cpu_read() cannot be determed (since its an implied address conversion through segment prefixes). Therefore apply this only to uses of __get_cpu_var where the addres of the variable is not used. Cc: Neil Horman <nhorman@tuxdriver.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Signed-off-by: Christoph Lameter <cl@linux.com> --- drivers/acpi/processor_idle.c | 6 +++--- drivers/char/random.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- drivers/s390/cio/cio.c | 2 +- drivers/staging/speakup/fakekey.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) Index: linux-2.6/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.orig/drivers/acpi/processor_idle.c 2010-11-30 12:40:22.000000000 -0600 +++ linux-2.6/drivers/acpi/processor_idle.c 2010-11-30 12:40:47.000000000 -0600 @@ -746,7 +746,7 @@ static int acpi_idle_enter_c1(struct cpu struct acpi_processor *pr; struct acpi_processor_cx *cx = cpuidle_get_statedata(state); - pr = __get_cpu_var(processors); + pr = __this_cpu_read(processors); if (unlikely(!pr)) return 0; @@ -787,7 +787,7 @@ static int acpi_idle_enter_simple(struct s64 idle_time_ns; s64 idle_time; - pr = __get_cpu_var(processors); + pr = __this_cpu_read(processors); if (unlikely(!pr)) return 0; @@ -864,7 +864,7 @@ static int acpi_idle_enter_bm(struct cpu s64 idle_time; - pr = __get_cpu_var(processors); + pr = __this_cpu_read(processors); if (unlikely(!pr)) return 0; Index: linux-2.6/drivers/char/random.c =================================================================== --- linux-2.6.orig/drivers/char/random.c 2010-11-30 12:40:22.000000000 -0600 +++ ...
Current_cpu_data accesses are per cpu accesses. We can also use
this_cpu_ops if a scalar is retrieved.
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +-
arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++--
arch/x86/kernel/smpboot.c | 10 +++++-----
3 files changed, 8 insertions(+), 8 deletions(-)
Index: linux-2.6/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smpboot.c 2010-11-30 11:53:03.000000000 -0600
+++ linux-2.6/arch/x86/kernel/smpboot.c 2010-11-30 11:57:02.000000000 -0600
@@ -430,7 +430,7 @@ void __cpuinit set_cpu_sibling_map(int c
cpumask_set_cpu(cpu, c->llc_shared_map);
- if (current_cpu_data.x86_max_cores == 1) {
+ if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
c->booted_cores = 1;
return;
@@ -1377,7 +1377,7 @@ void play_dead_common(void)
mb();
/* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
+ __this_cpu_write(cpu_state, CPU_DEAD);
/*
* With physical CPU hotplug, we should halt the cpu
@@ -1401,7 +1401,7 @@ static inline void mwait_play_dead(void)
return;
if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
return;
- if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
eax = CPUID_MWAIT_LEAF;
@@ -1452,7 +1452,7 @@ static inline void mwait_play_dead(void)
static inline void hlt_play_dead(void)
{
- if (current_cpu_data.x86 >= 4)
+ if (__this_cpu_read(cpu_info.x86) >= 4)
wbinvd();
while (1) {
Index: linux-2.6/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/cpufreq/powernow-k8.c 2010-11-30 11:53:03.000000000 -0600
+++ ...The whole function can be expressed as a simple this_cpu_read() operation.
The function overhead is now likely multiple times that of the single
instruction that is executed in it.
Cc: William Hubbs <w.d.hubbs@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
drivers/staging/speakup/fakekey.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
Index: linux-2.6/drivers/staging/speakup/fakekey.c
===================================================================
--- linux-2.6.orig/drivers/staging/speakup/fakekey.c 2010-11-30 12:10:12.000000000 -0600
+++ linux-2.6/drivers/staging/speakup/fakekey.c 2010-11-30 12:14:12.000000000 -0600
@@ -96,10 +96,5 @@ void speakup_fake_down_arrow(void)
*/
bool speakup_fake_key_pressed(void)
{
- bool is_pressed;
-
- is_pressed = get_cpu_var(reporting_keystroke);
- put_cpu_var(reporting_keystroke);
-
- return is_pressed;
+ return this_cpu_read(reporting_keystroke);
}
--
get_seq can benefit from this_cpu_operations. Address calculation is avoided
and the increment is done using an xadd.
Cc: Scott James Remnant <scott@ubuntu.com>
Cc: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
drivers/connector/cn_proc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
Index: linux-2.6/drivers/connector/cn_proc.c
===================================================================
--- linux-2.6.orig/drivers/connector/cn_proc.c 2010-11-30 09:38:33.000000000 -0600
+++ linux-2.6/drivers/connector/cn_proc.c 2010-11-30 09:39:38.000000000 -0600
@@ -43,9 +43,10 @@ static DEFINE_PER_CPU(__u32, proc_event_
static inline void get_seq(__u32 *ts, int *cpu)
{
- *ts = get_cpu_var(proc_event_counts)++;
+ preempt_disable();
+ *ts = __this_cpu_inc(proc_event_counts);
*cpu = smp_processor_id();
- put_cpu_var(proc_event_counts);
+ preempt_enable();
}
void proc_fork_connector(struct task_struct *task)
--
Use this_cpu ops in various places to optimize per cpu data access.
Cc: Jason Baron <jbaron@redhat.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/kernel/kprobes.c | 14 +++++++-------
include/linux/kprobes.h | 4 ++--
kernel/kprobes.c | 8 ++++----
3 files changed, 13 insertions(+), 13 deletions(-)
Index: linux-2.6/arch/x86/kernel/kprobes.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/kprobes.c 2010-11-30 12:31:51.000000000 -0600
+++ linux-2.6/arch/x86/kernel/kprobes.c 2010-11-30 12:32:15.000000000 -0600
@@ -403,7 +403,7 @@ static void __kprobes save_previous_kpro
static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -412,7 +412,7 @@ static void __kprobes restore_previous_k
static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = p;
+ __this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
= (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
if (is_IF_modifier(p->ainsn.insn))
@@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(stru
preempt_enable_no_resched();
return 1;
} else if (kprobe_running()) {
- p = __get_cpu_var(current_kprobe);
+ p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
setup_singlestep(p, regs, kcb, 0);
return 1;
@@ -759,11 +759,11 @@ static __used __kprobes void *trampoline
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = ...__get_cpu_var() can be replaced with this_cpu_read and will then use a single
read instruction with implied address calculation to access the correct per cpu
instance.
However, the address of a per cpu variable passed to __this_cpu_read() cannot be
determed (since its an implied address conversion through segment prefixes).
Therefore apply this only to uses of __get_cpu_var where the addres of the
variable is not used.
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Hugh Dickins <hughd@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
include/asm-generic/irq_regs.h | 8 +++----
include/linux/elevator.h | 12 ++---------
include/linux/kernel_stat.h | 2 -
kernel/exit.c | 2 -
kernel/fork.c | 2 -
kernel/hrtimer.c | 2 -
kernel/printk.c | 4 +--
kernel/rcutree.c | 4 +--
kernel/softirq.c | 44 ++++++++++++++++++++---------------------
kernel/time/tick-common.c | 2 -
kernel/time/tick-oneshot.c | 4 +--
kernel/watchdog.c | 36 ++++++++++++++++-----------------
mm/slab.c | 6 ++---
13 files changed, 61 insertions(+), 67 deletions(-)
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c 2010-11-30 12:40:22.000000000 -0600
+++ linux-2.6/mm/slab.c 2010-11-30 12:41:14.000000000 -0600
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
static void next_reap_node(void)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(slab_reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct k
*/
static void ...Go through x86 code and replace __get_cpu_var and get_cpu_var instances
that refer to a scalar and are not used for address determinations.
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/include/asm/debugreg.h | 2 -
arch/x86/kernel/apic/io_apic.c | 4 +--
arch/x86/kernel/apic/nmi.c | 24 +++++++++++-----------
arch/x86/kernel/apic/x2apic_uv_x.c | 8 +++----
arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 -
arch/x86/kernel/cpu/mcheck/mce.c | 6 ++---
arch/x86/kernel/cpu/perf_event.c | 32 ++++++++++++------------------
arch/x86/kernel/cpu/perf_event_intel.c | 4 +--
arch/x86/kernel/ftrace.c | 6 ++---
arch/x86/kernel/hw_breakpoint.c | 12 +++++------
arch/x86/kernel/irq.c | 6 ++---
arch/x86/kernel/irq_32.c | 4 +--
arch/x86/kernel/tsc.c | 2 -
arch/x86/kvm/x86.c | 8 +++----
arch/x86/oprofile/nmi_int.c | 2 -
15 files changed, 58 insertions(+), 64 deletions(-)
Index: linux-2.6/arch/x86/include/asm/debugreg.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/debugreg.h 2010-11-30 12:35:38.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/debugreg.h 2010-11-30 12:37:10.000000000 -0600
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable
static inline int hw_breakpoint_active(void)
{
- return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+ return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}
extern void aout_dump_debugregs(struct user *dump);
Index: linux-2.6/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/apic/io_apic.c 2010-11-30 12:35:38.000000000 -0600
+++ ...Use this_cpu_inc_return in one place and avoid ugly __raw_get_cpu in another.
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
kernel/taskstats.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/taskstats.c
===================================================================
--- linux-2.6.orig/kernel/taskstats.c 2010-11-30 10:06:35.000000000 -0600
+++ linux-2.6/kernel/taskstats.c 2010-11-30 10:10:14.000000000 -0600
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_inf
return -ENOMEM;
if (!info) {
- int seq = get_cpu_var(taskstats_seqnum)++;
- put_cpu_var(taskstats_seqnum);
+ int seq = this_cpu_inc_return(taskstats_seqnum);
reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
} else
@@ -581,7 +580,7 @@ void taskstats_exit(struct task_struct *
fill_tgid_exit(tsk);
}
- listeners = &__raw_get_cpu_var(listener_array);
+ listeners = __this_cpu_ptr(listener_array);
if (list_empty(&listeners->list))
return;
--
Hello Christoph, Hmmm, wouldn't seq now always be one more than before? I think that "seq = get_cpu_var(taskstats_seqnum)++" first assigns taskstats_seqnum to seq and then increases the value in contrast to this_cpu_inc_return() that returns the already increased value, correct? --
Correct. We need to subtract one from that (which will eliminate the minus -1 that the inline this_cpu_inc_return creates). --
But that breaks current behaviour, we should probably initialize all of the array to -1? -- Three Cheers, Balbir --
Not necessary. This_cpu_inc() uses an xadd instruction which retrieves the value and then increments the memory location. Then it adds 1. The -1 eliminates that add. --
Use this_cpu operations to optimize access primitives for highmem.
The main effect is the avoidance of address calculations through the
use of a segment prefix.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
include/linux/highmem.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
Index: linux-2.6/include/linux/highmem.h
===================================================================
--- linux-2.6.orig/include/linux/highmem.h 2010-11-22 14:43:40.000000000 -0600
+++ linux-2.6/include/linux/highmem.h 2010-11-22 14:45:02.000000000 -0600
@@ -81,7 +81,8 @@ DECLARE_PER_CPU(int, __kmap_atomic_idx);
static inline int kmap_atomic_idx_push(void)
{
- int idx = __get_cpu_var(__kmap_atomic_idx)++;
+ int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
#ifdef CONFIG_DEBUG_HIGHMEM
WARN_ON_ONCE(in_irq() && !irqs_disabled());
BUG_ON(idx > KM_TYPE_NR);
@@ -91,12 +92,12 @@ static inline int kmap_atomic_idx_push(v
static inline int kmap_atomic_idx(void)
{
- return __get_cpu_var(__kmap_atomic_idx) - 1;
+ return __this_cpu_read(__kmap_atomic_idx) - 1;
}
static inline int kmap_atomic_idx_pop(void)
{
- int idx = --__get_cpu_var(__kmap_atomic_idx);
+ int idx = __this_cpu_dec_return(__kmap_atomic_idx);
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(idx < 0);
#endif
--
You could change kmap_atomic_idx_pop() to return void, and use __this_cpu_dec(__kmap_atomic_idx) --
You can do the void change unconditionally, the debug code already uses
kmap_atomic_idx() because of:
---
commit 20273941f2129aa5a432796d98a276ed73d60782
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed Oct 27 15:32:58 2010 -0700
mm: fix race in kunmap_atomic()
Christoph reported a nice splat which illustrated a race in the new stack
based kmap_atomic implementation.
The problem is that we pop our stack slot before we're completely done
resetting its state -- in particular clearing the PTE (sometimes that's
CONFIG_DEBUG_HIGHMEM). If an interrupt happens before we actually clear
the PTE used for the last slot, that interrupt can reuse the slot in a
dirty state, which triggers a BUG in kmap_atomic().
Fix this by introducing kmap_atomic_idx() which reports the current slot
index without actually releasing it and use that to find the PTE and delay
the _pop() until after we're completely done.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reported-by: Christoph Hellwig <hch@infradead.org>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index c00f119..c435fd9 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -89,7 +89,7 @@ void __kunmap_atomic(void *kvaddr)
int idx, type;
if (kvaddr >= (void *)FIXADDR_START) {
- type = kmap_atomic_idx_pop();
+ type = kmap_atomic_idx();
idx = type + KM_TYPE_NR * smp_processor_id();
if (cache_is_vivt())
@@ -101,6 +101,7 @@ void __kunmap_atomic(void *kvaddr)
#else
(void) idx; /* to kill a warning */
#endif
+ kmap_atomic_idx_pop();
} else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
/* this address was obtained through kmap_high_get() */
...The following would do?
Subject: highmem: Use this_cpu_dec instead of __this_cpu_dec_return if !DEBUG_HIGHMEM
Signed-off-by: Christoph Lameter <cl@linux.com>
---
include/linux/highmem.h | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
Index: linux-2.6/include/linux/highmem.h
===================================================================
--- linux-2.6.orig/include/linux/highmem.h 2010-11-30 13:23:44.000000000 -0600
+++ linux-2.6/include/linux/highmem.h 2010-11-30 13:24:54.000000000 -0600
@@ -95,14 +95,19 @@ static inline int kmap_atomic_idx(void)
return __this_cpu_read(__kmap_atomic_idx) - 1;
}
+#ifdef CONFIG_DEBUG_HIGHMEM
static inline int kmap_atomic_idx_pop(void)
{
int idx = __this_cpu_dec_return(__kmap_atomic_idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(idx < 0);
-#endif
return idx;
}
+#else
+static inline void kmap_atomic_idx_pop(void)
+{
+ __this_cpu_dec(__kmap_atomic_idx);
+}
+#endif
#endif
--
well maybe a single prototype ;)
static inline void kmap_atomic_idx_pop(void)
{
#ifdef CONFIG_DEBUG_HIGHMEM
int idx = __this_cpu_dec_return(__kmap_atomic_idx);
BUG_ON(idx < 0);
#else
__this_cpu_dec(__kmap_atomic_idx);
#endif
}
--
Right, at least a consistent prototype, the above looks fine to me. --
Ok with right spacing this is:
Subject: highmem: Use this_cpu_dec instead of __this_cpu_dec_return if
!DEBUG_HIGHMEM
Signed-off-by: Christoph Lameter <cl@linux.com>
---
include/linux/highmem.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
Index: linux-2.6/include/linux/highmem.h
===================================================================
--- linux-2.6.orig/include/linux/highmem.h 2010-11-30 13:23:44.000000000 -0600
+++ linux-2.6/include/linux/highmem.h 2010-11-30 13:51:39.000000000 -0600
@@ -95,13 +95,15 @@ static inline int kmap_atomic_idx(void)
return __this_cpu_read(__kmap_atomic_idx) - 1;
}
-static inline int kmap_atomic_idx_pop(void)
+static inline void kmap_atomic_idx_pop(void)
{
- int idx = __this_cpu_dec_return(__kmap_atomic_idx);
#ifdef CONFIG_DEBUG_HIGHMEM
+ int idx = __this_cpu_dec_return(__kmap_atomic_idx);
+
BUG_ON(idx < 0);
+#else
+ __this_cpu_dec(__kmap_atomic_idx);
#endif
- return idx;
}
#endif
--
Introduce generic support for this_cpu_add_return etc.
The fallback is to realize these operations with simpler __this_cpu_ops.
Reviewed-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
include/linux/percpu.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h 2010-11-29 10:19:28.000000000 -0600
+++ linux-2.6/include/linux/percpu.h 2010-11-29 10:24:57.000000000 -0600
@@ -240,6 +240,25 @@ extern void __bad_size_call_parameter(vo
pscr_ret__; \
})
+#define __pcpu_size_call_return2(stem, pcp, ...) \
+({ \
+ typeof(pcp) ret__; \
+ __verify_pcpu_ptr(&(pcp)); \
+ switch(sizeof(pcp)) { \
+ case 1: ret__ = stem##1(pcp, __VA_ARGS__); \
+ break; \
+ case 2: ret__ = stem##2(pcp, __VA_ARGS__); \
+ break; \
+ case 4: ret__ = stem##4(pcp, __VA_ARGS__); \
+ break; \
+ case 8: ret__ = stem##8(pcp, __VA_ARGS__); \
+ break; \
+ default: \
+ __bad_size_call_parameter();break; \
+ } \
+ ret__; \
+})
+
#define __pcpu_size_call(stem, variable, ...) \
do { \
__verify_pcpu_ptr(&(variable)); \
@@ -529,6 +548,64 @@ do { \
# define __this_cpu_xor(pcp, val) __pcpu_size_call(__this_cpu_xor_, (pcp), (val))
#endif
+#define _this_cpu_generic_add_return(pcp, val) \
+({ \
+ typeof(pcp) ret__; \
+ preempt_disable(); \
+ __this_cpu_add(pcp, val); \
+ ret__ = __this_cpu_read(pcp); \
+ preempt_enable(); \
+ ret__; \
+})
+
+#ifndef this_cpu_add_return
+# ifndef this_cpu_add_return_1
+# define this_cpu_add_return_1(pcp, val) _this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef this_cpu_add_return_2
+# define this_cpu_add_return_2(pcp, ...this_cpu_inc_return() saves us a memory access there. Code
size does not change.
V1->V2:
- Fixed the location of the __per_cpu pointer attributes
- Sparse checked
V2->V3:
- Move fixes to __percpu attribute usage to earlier patch
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
---
mm/vmstat.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2010-11-29 10:36:16.000000000 -0600
+++ linux-2.6/mm/vmstat.c 2010-11-29 10:38:50.000000000 -0600
@@ -227,9 +227,7 @@ void __inc_zone_state(struct zone *zone,
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- __this_cpu_inc(*p);
-
- v = __this_cpu_read(*p);
+ v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
s8 overstep = t >> 1;
@@ -251,9 +249,7 @@ void __dec_zone_state(struct zone *zone,
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- __this_cpu_dec(*p);
-
- v = __this_cpu_read(*p);
+ v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
s8 overstep = t >> 1;
--
this_cpu_inc_return() saves us a memory access there. Reviewed-by: Tejun Heo <tj@kernel.org> Reviewed-by: Pekka Enberg <penberg@kernel.org> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Christoph Lameter <cl@linux.com> --- arch/x86/kernel/apic/nmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) Index: linux-2.6/arch/x86/kernel/apic/nmi.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/apic/nmi.c 2010-11-23 16:35:19.000000000 -0600 +++ linux-2.6/arch/x86/kernel/apic/nmi.c 2010-11-23 16:38:29.000000000 -0600 @@ -432,8 +432,7 @@ nmi_watchdog_tick(struct pt_regs *regs, * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) + if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ --
Supply an implementation for x86 in order to generate more efficient code.
V2->V3:
- Cleanup
- Remove strange type checking from percpu_add_return_op.
Signed-off-by: Christoph Lameter <cl@linux.com>
---
arch/x86/include/asm/percpu.h | 46 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h 2010-11-29 14:29:13.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h 2010-11-30 08:42:02.000000000 -0600
@@ -177,6 +177,41 @@ do { \
} \
} while (0)
+
+/*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val) \
+({ \
+ typedef typeof(var) pao_T__; \
+ typeof(var) ret__ = val; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("xaddb %0, "__percpu_arg(1) \
+ : "+q" (ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 2: \
+ asm("xaddw %0, "__percpu_arg(1) \
+ : "+r" (ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 4: \
+ asm("xaddl %0, "__percpu_arg(1) \
+ : "+r"(ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 8: \
+ asm("xaddq %0, "__percpu_arg(1) \
+ : "+re" (ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ ret__ += val; \
+ ret__; \
+})
+
#define percpu_from_op(op, var, constraint) \
({ \
typeof(var) pfo_ret__; \
@@ -300,6 +335,14 @@ do { \
#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, ...The this_cpu_* options can be used to optimize __percpu_counter_add a bit. Avoids some address arithmetic and saves 12 bytes. Before: 00000000000001d3 <__percpu_counter_add>: 1d3: 55 push %rbp 1d4: 48 89 e5 mov %rsp,%rbp 1d7: 41 55 push %r13 1d9: 41 54 push %r12 1db: 53 push %rbx 1dc: 48 89 fb mov %rdi,%rbx 1df: 48 83 ec 08 sub $0x8,%rsp 1e3: 4c 8b 67 30 mov 0x30(%rdi),%r12 1e7: 65 4c 03 24 25 00 00 add %gs:0x0,%r12 1ee: 00 00 1f0: 4d 63 2c 24 movslq (%r12),%r13 1f4: 48 63 c2 movslq %edx,%rax 1f7: 49 01 f5 add %rsi,%r13 1fa: 49 39 c5 cmp %rax,%r13 1fd: 7d 0a jge 209 <__percpu_counter_add+0x36> 1ff: f7 da neg %edx 201: 48 63 d2 movslq %edx,%rdx 204: 49 39 d5 cmp %rdx,%r13 207: 7f 1e jg 227 <__percpu_counter_add+0x54> 209: 48 89 df mov %rbx,%rdi 20c: e8 00 00 00 00 callq 211 <__percpu_counter_add+0x3e> 211: 4c 01 6b 18 add %r13,0x18(%rbx) 215: 48 89 df mov %rbx,%rdi 218: 41 c7 04 24 00 00 00 movl $0x0,(%r12) 21f: 00 220: e8 00 00 00 00 callq 225 <__percpu_counter_add+0x52> 225: eb 04 jmp 22b <__percpu_counter_add+0x58> 227: 45 89 2c 24 mov %r13d,(%r12) 22b: 5b pop %rbx 22c: 5b pop %rbx 22d: 41 5c pop %r12 22f: 41 5d pop %r13 231: c9 leaveq 232: c3 retq After: 00000000000001d3 <__percpu_counter_add>: 1d3: 55 push %rbp 1d4: 48 63 ca movslq %edx,%rcx 1d7: 48 89 e5 mov %rsp,%rbp 1da: 41 54 push %r12 1dc: 53 push %rbx 1dd: 48 89 fb ...
Subject: timers: Use this_cpu_read
Eric asked for this.
Signed-off-by: Christoph Lameter <cl@linux.com>
---
kernel/timer.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c 2010-11-30 14:01:41.000000000 -0600
+++ linux-2.6/kernel/timer.c 2010-11-30 14:02:05.000000000 -0600
@@ -1249,7 +1249,7 @@ static unsigned long cmp_next_hrtimer_ev
*/
unsigned long get_next_timer_interrupt(unsigned long now)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
unsigned long expires;
spin_lock(&base->lock);
@@ -1292,7 +1292,7 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
hrtimer_run_pending();
--
Commit-ID: 7496351ad87e61e96b49dd7b43c6534e3401f566 Gitweb: http://git.kernel.org/tip/7496351ad87e61e96b49dd7b43c6534e3401f566 Author: Christoph Lameter <cl@linux.com> AuthorDate: Tue, 30 Nov 2010 14:05:53 -0600 Committer: Thomas Gleixner <tglx@linutronix.de> CommitDate: Sun, 12 Dec 2010 18:38:09 +0100 timers: Use this_cpu_read Eric asked for this. [tglx: Because it generates faster code according to Erics ] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Tejun Heo <tj@kernel.org> Cc: linux-mm@kvack.org LKML-Reference: <alpine.DEB.2.00.1011301404490.4039@router.home> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/timer.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 483e54b..beb97fd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1227,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); unsigned long expires; spin_lock(&base->lock); @@ -1267,7 +1267,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); hrtimer_run_pending(); --
