Index: linux/Documentation/RCU/proc.txt =================================================================== --- /dev/null +++ linux/Documentation/RCU/proc.txt @@ -0,0 +1,207 @@ +/proc Filesystem Entries for RCU + + +CONFIG_RCU_STATS + +The CONFIG_RCU_STATS config option is available only in conjunction with +CONFIG_PREEMPT_RCU. It makes four /proc entries available, namely: rcuctrs, +rcuptrs, rcugp, and rcustats. + +/proc/rcuctrs + + CPU last cur + 0 1 1 + 1 1 1 + 2 1 1 + 3 0 2 + ggp = 230725 + +This displays the number of processes that started RCU read-side critical +sections on each CPU. In absence of preemption, the "last" and "cur" +counts for a given CPU will always sum to one. Therefore, in the example +output above, each CPU has started one RCU read-side critical section +that was later preempted. The "last" column counts RCU read-side critical +sections that started prior to the last counter flip, while the "cur" +column counts critical sections that started after the last counter flip. + +The "ggp" count is a count of the number of counter flips since boot. +Since this is shown as an odd number, the "cur" counts are stored in +the zero-th element of each of the per-CPU arrays, and the "last" counts +are stored in the first element of each of the per-CPU arrays. + + +/proc/rcuptrs + + nl=c04c7160/c04c7960 nt=c04c72d0 + wl=c04c7168/c04c794c wt=c04c72bc dl=c04c7170/00000000 dt=c04c7170 + +This displays the head and tail of each of CONFIG_PREEMPT_RCU's three +callback lists. This will soon change to display this on a per-CPU +basis, since each CPU will soon have its own set of callback lists. +In the example above, the "next" list header is located at hex address +0xc04c7160, the first element on the list at hex address 0xc04c7960, +and the last element on the list at hex address 0xc04c72d0. The "wl=" +and "wt=" output is similar for the "wait" list, and the "dl=" and "dt=" +output for the "done" list. 
The "done" list is normally emptied very +quickly after being filled, so will usually be empty as shown above. +Note that the tail pointer points into the list header in this case. + +Callbacks are placed in the "next" list by call_rcu(), moved to the +"wait" list after the next counter flip, and moved to the "done" list +on the counter flip after that. Once on the "done" list, the callbacks +are invoked. + + +/proc/rcugp + + oldggp=241419 newggp=241421 + +This entry invokes synchronize_rcu() and prints out the number of counter +flips since boot before and after the synchronize_rcu(). These two +numbers will always differ by at least two. Unless RCU is broken. ;-) + + +/proc/rcustats + + ggp=242416 lgp=242416 sr=0 rcc=396233 + na=2090938 nl=9 wa=2090929 wl=9 dl=0 dr=2090920 di=2090920 + rtf1=22230730 rtf2=20139162 rtf3=242416 rtfe1=2085911 rtfe2=5657 rtfe3=19896746 + +The quantities printed are as follows: + +o "ggp=": The number of flips since boot. + +o "lgp=": The number of flips sensed by the local structure since + boot. This will soon be per-CPU. + +o "sr=": The number of explicit calls to synchronize_rcu(). + Except that this is currently broken, so always reads as zero. + It is likely to be removed... + +o "rcc=": The number of calls to rcu_check_callbacks(). + +o "na=": The number of callbacks that call_rcu() has registered + since boot. + +o "nl=": The number of callbacks currently on the "next" list. + +o "wa=": The number of callbacks that have moved to the "wait" + list since boot. + +o "wl=": The number of callbacks currently on the "wait" list. + +o "da=": The number of callbacks that have been moved to the + "done" list since boot. + +o "dl=": The number of callbacks currently on the "done" list. + +o "dr=": The number of callbacks that have been removed from the + "done" list since boot. + +o "di=": The number of callbacks that have been invoked after being + removed from the "done" list. + +o "rtf1=": The number of attempts to flip the counters.
+ +o "rtf2=": The number of attempts to flip the counters that successfully + acquired the fliplock. + +o "rtf3=": The number of successful counter flips. + +o "rtfe1=": The number of attempts to flip the counters that failed + due to the lock being held by someone else. + +o "rtfe2=": The number of attempts to flip the counters that were + abandoned due to someone else doing the job for us. + +o "rtfe3=": The number of attempts to flip the counters that failed + due to some task still being in an RCU read-side critical section + starting from before the last successful counter flip. + + +CONFIG_RCU_TORTURE_TEST + +The CONFIG_RCU_TORTURE_TEST config option is available for all RCU +implementations. It makes three /proc entries available, namely: rcutw, +rcutr, and rcuts. + + +/proc/rcutw + +Reading this entry starts a new torture test, or ends an earlier one +if one is already in progress (in other words, there can be only one +writer at a time). This sleeps uninterruptibly, so be sure to run +it in the background. One could argue that it would be good to have +multiple writers, but Linux uses RCU heavily enough that you will get +write-side contention whether you want it or not. If you want additional +write-side contention, repeatedly create and destroy several large file +trees in parallel. Or use some other RCU-protected update. + + +/proc/rcutr + +Reading this entry starts a new torture reader, which runs until sent +a signal (e.g., control-C). If testing an RCU implementation with +preemptible read-side critical sections, make sure to spawn at least +two /proc/rcutr instances for each CPU. 
+ + +/proc/rcuts + +Displays the current state of the torture test: + + ggp = 20961 + rtc: c04496f4 ver: 8734 tfle: 0 rta: 8734 rtaf: 0 rtf: 8715 + Reader Pipe: 88024120 63914 0 0 0 0 0 0 0 0 0 + Reader Batch: 88024097 63937 0 0 0 0 0 0 0 0 + Free-Block Circulation: 8733 8731 8729 8727 8725 8723 8721 8719 8717 8715 0 + +The entries are as follows: + +o "ggp": The number of counter flips (or batches) since boot. + +o "rtc": The hexadecimal address of the structure currently visible + to readers. + +o "ver": The number of times since boot that the rcutw writer task + has changed the structure visible to readers. + +o "tfle": If non-zero, indicates that the "torture freelist" + containing structures to be placed into the "rtc" area is empty. + This condition is important, since it can fool you into thinking + that RCU is working when it is not. :-/ + +o "rta": Number of structures allocated from the torture freelist. + +o "rtaf": Number of allocations from the torture freelist that have + failed due to the list being empty. + +o "rtf": Number of frees into the torture freelist. + +o "Reader Pipe": Histogram of "ages" of structures seen by readers. + If any entries past the first two are non-zero, RCU is broken. + And /proc/rcuts prints "!!!" to make sure you notice. The age + of a newly allocated structure is zero, it becomes one when + removed from reader visibility, and is incremented once per + grace period subsequently -- and is freed after passing through + (RCU_TORTURE_PIPE_LEN-2) grace periods. + + The output displayed above was taken from a correctly working + RCU. If you want to see what it looks like when broken, break + it yourself. ;-) + +o "Reader Batch": Another histogram of "ages" of structures seen + by readers, but in terms of counter flips (or batches) rather + than in terms of grace periods. The legal number of non-zero + entries is again two.
The reason for this separate view is + that it is easier to get the third entry to show up in the + "Reader Batch" list than in the "Reader Pipe" list. + +o "Free-Block Circulation": Shows the number of torture structures + that have reached a given point in the pipeline. The first element + should closely correspond to the number of structures allocated, + the second to the number that have been removed from reader view, + and all but the last remaining to the corresponding number of + passes through a grace period. The last entry should be zero, + as it is only incremented if a torture structure's counter + somehow gets incremented farther than it should. Index: linux/MAINTAINERS =================================================================== --- linux.orig/MAINTAINERS +++ linux/MAINTAINERS @@ -960,6 +960,12 @@ L: linux-nvidia@lists.surfsouth.com W: http://drama.obuda.kando.hu/~fero/cgi-bin/hgafb.shtml S: Maintained +High-Res-Timers (HRT) extension to Posix Clocks & Timers +P: George Anzinger +M: george@mvista.com +L: linux-net@vger.kernel.org +S: Supported + HIGH-SPEED SCC DRIVER FOR AX.25 P: Klaus Kudielka M: klaus.kudielka@ieee.org Index: linux/Makefile =================================================================== --- linux.orig/Makefile +++ linux/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 13 -EXTRAVERSION =-rc7 +EXTRAVERSION =-rc7-rt3 NAME=Woozy Numbat # *DOCUMENTATION* @@ -517,10 +517,14 @@ CFLAGS += $(call add-align,CONFIG_CC_AL CFLAGS += $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops) CFLAGS += $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps) -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += 
-fomit-frame-pointer + endif endif ifdef CONFIG_DEBUG_INFO Index: linux/arch/arm/mach-pxa/corgi_ssp.c =================================================================== --- linux.orig/arch/arm/mach-pxa/corgi_ssp.c +++ linux/arch/arm/mach-pxa/corgi_ssp.c @@ -22,7 +22,7 @@ #include #include -static spinlock_t corgi_ssp_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(corgi_ssp_lock); static struct ssp_dev corgi_ssp_dev; static struct ssp_state corgi_ssp_state; Index: linux/arch/i386/Kconfig =================================================================== --- linux.orig/arch/i386/Kconfig +++ linux/arch/i386/Kconfig @@ -368,16 +368,6 @@ config X86_L1_CACHE_SHIFT default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1 default "6" if MK7 || MK8 || MPENTIUMM -config RWSEM_GENERIC_SPINLOCK - bool - depends on M386 - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -434,7 +424,7 @@ config X86_USE_PPRO_CHECKSUM config X86_USE_3DNOW bool - depends on MCYRIXIII || MK7 + depends on (MCYRIXIII || MK7) && !PREEMPT_RT default y config X86_OOSTORE @@ -442,6 +432,17 @@ config X86_OOSTORE depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR default y +choice + prompt "Clock & Timer Selection" + default LEGACY_TIMER + help + You may have either HPET, High Resolution, or Legacy timer support. + +config LEGACY_TIMER + bool "Legacy Timer Support" + help + This chooses the legacy 8254 (PIT) for timer support. + config HPET_TIMER bool "HPET Timer Support" help @@ -451,7 +452,112 @@ config HPET_TIMER activated if the platform and the BIOS support this feature. Otherwise the 8254 will be used for timing services. - Choose N to continue using the legacy 8254 timer. 
+config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + help + This option enables high resolution POSIX clocks and timers with + resolution of at least 1 microsecond. High resolution timers are not + free, a small overhead is incurred each time a timer that does not + expire on a 1/HZ tick boundary is used. If no such timers are used + the overhead is nil. + + The POSIX clocks CLOCK_REALTIME and CLOCK_MONOTONIC are available by + default. This option enables two additional clocks, CLOCK_REALTIME_HR + and CLOCK_MONOTONIC_HR. Note that this option does not change the + resolution of CLOCK_REALTIME or CLOCK_MONOTONIC, which remain at 1/HZ + resolution. + +endchoice + +choice + prompt "High Resolution Timer clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_TSC + help + This option allows you to choose the wall clock timer for your + system. With high resolution timers on the x86 platforms it + is best to keep the interrupt-generating timer separate from + the time-keeping timer. On x86 platforms there are two + possible sources implemented for the wall clock. These are: + + + ACPI power management (pm) timer ~280 nanoseconds + TSC (Time Stamp Counter) 1/CPU clock + + The PIT is always used to generate clock interrupts, but in + SMP systems the APIC timers are used to drive the timer list + code. This means that in SMP systems the PIT will not be + programmed to generate arch_cycle events and can give + reasonable service as the clock interrupt. In non-SMP (UP) + systems it will be programmed to interrupt when the next timer + is to expire or on the next 1/HZ tick. + + The TSC runs at the CPU clock rate (i.e. its resolution is + 1/CPU clock) and it has a very low access time. However, it + is subject, in some (incorrect) processors, to throttling to + cool the CPU, and to other slowdowns during power management. + If your system has power management code active these changes + are tracked by the TSC timer code.
If your CPU is correct and + does not change the TSC frequency for throttling or power + management outside of the power management kernel code, this is + the best clock timer. + + The ACPI pm timer is available on systems with Advanced + Configuration and Power Interface support. The pm timer is + available on these systems even if you don't use or enable + ACPI in the software or the BIOS (but see Default ACPI pm + timer address). The timer has a resolution of about 280 + nanoseconds, however, the access time is a bit higher than + that of the TSC. Since it is part of ACPI it is intended to + keep track of time while the system is under power management, + thus it is not subject to the power management problems of the + TSC. + + If you enable the ACPI pm timer and it cannot be found, it is + possible that your BIOS is not producing the ACPI table or + that your machine does not support ACPI. In the former case, + see "Default ACPI pm timer address". If the timer is not + found the boot will fail when trying to calibrate the 'delay' + loop. + +# config HIGH_RES_TIMER_ACPI_PM +# bool "ACPI-pm-timer" + +config HIGH_RES_TIMER_TSC + bool "Time-stamp-counter/TSC" + depends on X86_TSC + +endchoice + +config HIGH_RES_RESOLUTION + int "High Resolution Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + This sets the resolution in nanoseconds of the CLOCK_REALTIME_HR and + CLOCK_MONOTONIC_HR timers. Too fine a resolution (too small a number) + will usually not be observable due to normal system latencies. For an + 800 MHz processor about 10,000 (10 microseconds) is recommended as the + finest resolution. If you don't need that sort of resolution, + larger values may generate less overhead. + +config HIGH_RES_TIMER_ACPI_PM_ADD + int "Default ACPI pm timer address" + depends on HIGH_RES_TIMER_ACPI_PM + default 0 + help + This option is available for use on systems where the BIOS + does not generate the ACPI tables if ACPI is not enabled.
For + example some BIOSes will not generate the ACPI tables if APM + is enabled. The ACPI pm timer is still available but cannot + be found by the software. This option allows you to supply + the needed address. When the high resolution timers code + finds a valid ACPI pm timer address it reports it in the boot + messages log (look for lines that begin with + "High-res-timers:"). You can turn on the ACPI support in the + BIOS, boot the system and find this value. You can then enter + it at configure time. Both the report and the entry are in + decimal. config HPET_EMULATE_RTC bool @@ -513,6 +619,20 @@ config SCHED_SMT source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER) @@ -548,6 +668,16 @@ config X86_IO_APIC depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) default y +config X86_IOAPIC_FAST + bool "enhanced IO-APIC support" + depends on X86_IO_APIC + default y + help + this option will activate further optimizations in the IO-APIC + code. NOTE: this is experimental code, and disabled by default. + Symptoms of non-working systems are boot-time lockups, stray or + screaming interrupts and other interrupt related weirdnesses. + config X86_VISWS_APIC bool depends on X86_VISWS @@ -913,7 +1043,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. 
This uses a different ABI Index: linux/arch/i386/Kconfig.debug =================================================================== --- linux.orig/arch/i386/Kconfig.debug +++ linux/arch/i386/Kconfig.debug @@ -18,6 +18,7 @@ config EARLY_PRINTK config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -35,6 +36,7 @@ config KPROBES config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. Index: linux/arch/i386/boot/compressed/misc.c =================================================================== --- linux.orig/arch/i386/boot/compressed/misc.c +++ linux/arch/i386/boot/compressed/misc.c @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -112,7 +118,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; Index: linux/arch/i386/kernel/Makefile =================================================================== --- linux.orig/arch/i386/kernel/Makefile +++ linux/arch/i386/kernel/Makefile @@ -4,11 +4,12 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ +obj-y := process.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ doublefault.o quirks.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += cpu/ obj-y += timers/ 
obj-$(CONFIG_ACPI_BOOT) += acpi/ @@ -20,6 +21,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o Index: linux/arch/i386/kernel/apic.c =================================================================== --- linux.orig/arch/i386/kernel/apic.c +++ linux/arch/i386/kernel/apic.c @@ -36,11 +36,18 @@ #include #include #include +#include #include #include "io_ports.h" +#ifndef CONFIG_HIGH_RES_TIMERS +#define compute_latch(a) +#else +extern void apic_timer_ipi_interrupt(struct pt_regs regs); +#endif + /* * Knob to control our willingness to enable the local APIC. */ @@ -76,6 +83,13 @@ void __init apic_intr_init(void) { #ifdef CONFIG_SMP smp_intr_init(); +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * We let the PIT timer interrupt trigger all CPU's + * for profiling and update_system_time + */ + set_intr_gate(LOCAL_TIMER_IPI_VECTOR, apic_timer_ipi_interrupt); +#endif #endif /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -90,12 +104,11 @@ void __init apic_intr_init(void) #endif } -/* Using APIC to generate smp_local_timer_interrupt? 
*/ -int using_apic_timer = 0; - +#ifndef CONFIG_HIGH_RES_TIMERS static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; static DEFINE_PER_CPU(int, prof_counter) = 1; +#endif static int enabled_via_apicbase; @@ -566,9 +579,9 @@ void lapic_shutdown(void) if (!cpu_has_apic || !enabled_via_apicbase) return; - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } #ifdef CONFIG_PM @@ -612,9 +625,9 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_irq_save(flags); + raw_local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -626,7 +639,7 @@ static int lapic_resume(struct sys_devic if (!apic_pm_state.active) return 0; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Make sure the APICBASE points to the right address @@ -657,7 +670,7 @@ static int lapic_resume(struct sys_devic apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - local_irq_restore(flags); + raw_local_irq_restore(flags); return 0; } @@ -850,10 +863,10 @@ fake_ioapic_page: ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); + set_fixmap_nocache(idx, ioapic_phys); + printk(KERN_DEBUG "faked IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); idx++; } } @@ -926,13 +939,24 @@ void (*wait_timer_tick)(void) __devinitd */ #define APIC_DIVISOR 16 - +/* + * For high res timers we want a single shot timer. 
+ * This means, for profiling, that we must load it each + * interrupt, but it works best for timers as a one shot and + * it is little overhead for the profiling which, we hope is + * not done that often, nor on production machines. + */ static void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = +#ifndef CONFIG_HIGH_RES_TIMERS + APIC_LVT_TIMER_PERIODIC | +#endif + LOCAL_TIMER_VECTOR; + if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); apic_write_around(APIC_LVTT, lvtt_value); @@ -952,7 +976,7 @@ static void __devinit setup_APIC_timer(u { unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); /* * Wait for IRQ0's slice: @@ -961,7 +985,7 @@ static void __devinit setup_APIC_timer(u __setup_APIC_LVTT(clocks); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /* @@ -1050,7 +1074,7 @@ void __init setup_boot_APIC_clock(void) apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - local_irq_disable(); + raw_local_irq_disable(); calibration_result = calibrate_APIC_clock(); /* @@ -1058,7 +1082,9 @@ void __init setup_boot_APIC_clock(void) */ setup_APIC_timer(calibration_result); - local_irq_enable(); + compute_latch(calibration_result / APIC_DIVISOR); + + raw_local_irq_enable(); } void __devinit setup_secondary_APIC_clock(void) @@ -1086,6 +1112,87 @@ void enable_APIC_timer(void) } } + +/* High resolution timer specific quirks */ +#ifdef CONFIG_HIGH_RES_TIMERS + +#ifdef CONFIG_SMP +/* + * We use the lapic timer for hrt and therefor discard + * the variable profiling + */ +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * This code ONLY takes IPI interrupts from the PIT interrupt handler + */ +fastcall notrace void smp_apic_timer_ipi_interrupt(struct pt_regs *regs) +{ + int cpu = 
smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + trace_special(regs->eip, 0, 0); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + // we profile via the NMI +#if 0 + profile_tick(CPU_PROFILING, regs); +#endif + update_process_times(user_mode(regs)); + irq_exit(); + +} +#endif + +void reload_apic_timer(unsigned int clocks) +{ + apic_write_around(APIC_TMICT, clocks); +} + +/* + * We use the local apic timer for high resolution timer interrups + * This moves profiling and update process times into the PIT. + * + * We lose the variable multiplier of profiling, .... + * +*/ +inline void smp_local_timer_interrupt(struct pt_regs * regs) +{ + /* + * This is ugly, but we _must_ protect the hr interrupt code + * against corruption of jiffies because the !using_apic_timer + * path in do_timer_interrupt calls the hr interrupt code with + * xtime lock held too. We dont know at compile time whether + * we can use the local apic or not, so we have to do the + * locking here. + */ + write_seqlock(&xtime_lock); + if (highres_active) { + do_hr_timer_int(); + } + write_sequnlock(&xtime_lock); +} + +#else /* !HIGH_RES_TIMERS */ + +#ifdef CONFIG_SMP /* * the frequency of the profiling timer can be changed * by writing a multiplier value into /proc/profile. @@ -1113,8 +1220,7 @@ int setup_profiling_timer(unsigned int m return 0; } - -#undef APIC_DIVISOR +#endif /* * Local timer interrupt handler. 
It does both profiling and @@ -1130,7 +1236,11 @@ inline void smp_local_timer_interrupt(st { int cpu = smp_processor_id(); + // we profile via the NMI +#if 0 profile_tick(CPU_PROFILING, regs); +#endif + if (--per_cpu(prof_counter, cpu) <= 0) { /* * The multiplier may have changed since the last time we got @@ -1149,7 +1259,6 @@ inline void smp_local_timer_interrupt(st per_cpu(prof_old_multiplier, cpu) = per_cpu(prof_counter, cpu); } - #ifdef CONFIG_SMP update_process_times(user_mode_vm(regs)); #endif @@ -1167,6 +1276,8 @@ inline void smp_local_timer_interrupt(st */ } +#endif /* !HIGH_RES_TIMERS */ + /* * Local APIC timer interrupt. This is the most natural way for doing * local interrupts, but local timer interrupts can be emulated by @@ -1176,7 +1287,7 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); @@ -1185,6 +1296,8 @@ fastcall void smp_apic_timer_interrupt(s */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. 
Index: linux/arch/i386/kernel/apm.c =================================================================== --- linux.orig/arch/i386/kernel/apm.c +++ linux/arch/i386/kernel/apm.c @@ -552,9 +552,9 @@ static inline void apm_restore_cpus(cpum */ #define APM_DO_CLI \ if (apm_info.allow_ints) \ - local_irq_enable(); \ + raw_local_irq_enable(); \ else \ - local_irq_disable(); + raw_local_irq_disable(); #ifdef APM_ZERO_SEGS # define APM_DECL_SEGS \ @@ -604,12 +604,12 @@ static u8 apm_bios_call(u32 func, u32 eb save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8]; per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); + raw_local_save_flags(flags); APM_DO_CLI; APM_DO_SAVE_SEGS; apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -647,12 +647,12 @@ static u8 apm_bios_call_simple(u32 func, save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8]; per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); + raw_local_save_flags(flags); APM_DO_CLI; APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + raw_local_irq_restore(flags); __get_cpu_var(cpu_gdt_table)[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -1194,7 +1194,7 @@ static int suspend(int vetoable) } device_suspend(PMSG_SUSPEND); - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ @@ -1210,14 +1210,14 @@ static int suspend(int vetoable) */ spin_unlock(&i8253_lock); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); save_processor_state(); err = set_system_power_state(APM_STATE_SUSPEND); ignore_normal_resume = 1; restore_processor_state(); - local_irq_disable(); + raw_local_irq_disable(); 
write_seqlock(&xtime_lock); spin_lock(&i8253_lock); reinit_timer(); @@ -1232,7 +1232,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(APM_NORMAL_RESUME, NULL); @@ -1251,22 +1251,22 @@ static void standby(void) { int err; - local_irq_disable(); + raw_local_irq_disable(); device_power_down(PMSG_SUSPEND); /* serialize with the timer interrupt */ write_seqlock(&xtime_lock); /* If needed, notify drivers here */ get_time_diff(); write_sequnlock(&xtime_lock); - local_irq_enable(); + raw_local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) apm_error("standby", err); - local_irq_disable(); + raw_local_irq_disable(); device_power_up(); - local_irq_enable(); + raw_local_irq_enable(); } static apm_event_t get_event(void) Index: linux/arch/i386/kernel/cpu/mtrr/cyrix.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/cyrix.c +++ linux/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -17,7 +17,7 @@ cyrix_get_arr(unsigned int reg, unsigned arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ /* Save flags and disable interrupts */ - local_irq_save(flags); + raw_local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -28,7 +28,7 @@ cyrix_get_arr(unsigned int reg, unsigned setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ /* Enable interrupts if it was enabled previously */ - local_irq_restore(flags); + raw_local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; Index: linux/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux/arch/i386/kernel/cpu/mtrr/generic.c @@ 
-234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they @@ -296,14 +296,14 @@ static void generic_set_all(void) unsigned long mask, count; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); /* Actually set the state */ mask = set_mtrr_state(deftype_lo,deftype_hi); post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { @@ -331,7 +331,7 @@ static void generic_set_mtrr(unsigned in vr = &mtrr_state.var_ranges[reg]; - local_irq_save(flags); + raw_local_irq_save(flags); prepare_set(); if (size == 0) { @@ -350,7 +350,7 @@ static void generic_set_mtrr(unsigned in } post_set(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) Index: linux/arch/i386/kernel/cpu/mtrr/main.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/main.c +++ linux/arch/i386/kernel/cpu/mtrr/main.c @@ -146,7 +146,7 @@ static void ipi_handler(void *info) struct set_mtrr_data *data = info; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); atomic_dec(&data->count); while(!atomic_read(&data->gate)) @@ -164,7 +164,7 @@ static void ipi_handler(void *info) cpu_relax(); atomic_dec(&data->count); - local_irq_restore(flags); + raw_local_irq_restore(flags); } #endif @@ -225,7 +225,7 @@ static void set_mtrr(unsigned int reg, u if (smp_call_function(ipi_handler, &data, 1, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); - local_irq_save(flags); + raw_local_irq_save(flags); while(atomic_read(&data.count)) cpu_relax(); @@ -259,7 +259,7 @@ 
static void set_mtrr(unsigned int reg, u while(atomic_read(&data.count)) cpu_relax(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } /** @@ -687,11 +687,11 @@ void mtrr_ap_init(void) * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to * prevent mtrr entry changes */ - local_irq_save(flags); + raw_local_irq_save(flags); mtrr_if->set_all(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static int __init mtrr_init_finialize(void) Index: linux/arch/i386/kernel/cpu/mtrr/state.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/state.c +++ linux/arch/i386/kernel/cpu/mtrr/state.c @@ -12,7 +12,7 @@ void set_mtrr_prepare_save(struct set_mt unsigned int cr0; /* Disable interrupts locally */ - local_irq_save(ctxt->flags); + raw_local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { @@ -73,6 +73,6 @@ void set_mtrr_done(struct set_mtrr_conte write_cr4(ctxt->cr4val); } /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); + raw_local_irq_restore(ctxt->flags); } Index: linux/arch/i386/kernel/entry.S =================================================================== --- linux.orig/arch/i386/kernel/entry.S +++ linux/arch/i386/kernel/entry.S @@ -76,10 +76,10 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop cli +# define preempt_stop cli #else -#define preempt_stop -#define resume_kernel restore_nocheck +# define preempt_stop +# define resume_kernel restore_nocheck #endif #define SAVE_ALL \ @@ -160,14 +160,17 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? 
testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + cli call preempt_schedule_irq jmp need_resched #endif @@ -200,6 +203,11 @@ sysenter_past_esp: pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -213,6 +221,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -225,6 +238,11 @@ sysenter_past_esp: ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -254,6 +272,17 @@ restore_all: cmpl $((4 << 8) | 3), %eax je ldt_ss # returning to user-space with LDT SS restore_nocheck: +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE) + pushl %eax +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + call trace_irqs_on +#endif +#ifdef CONFIG_LATENCY_TRACE + call sys_ret +#endif + popl %eax +#endif +restore_nocheck_nmi: RESTORE_REGS addl $4, %esp 1: iret @@ -297,18 +326,19 @@ ldt_ss: # perform work that needs to be done immediately before resumption ALIGN work_pending: - testb $_TIF_NEED_RESCHED, %cl + testb $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %cl jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + call __schedule + # make sure we don't miss an interrupt # 
setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testb $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %cl jnz work_resched work_notifysig: # deal with pending signals and @@ -348,6 +378,11 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + pushl %eax + call trace_irqs_on + popl %eax +#endif sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -409,9 +444,16 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define TRACE_IRQS_OFF call trace_irqs_off_lowlevel; +#else +# define TRACE_IRQS_OFF +#endif + ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -420,6 +462,7 @@ common_interrupt: ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; @@ -549,7 +592,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_nmi nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) Index: linux/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux.orig/arch/i386/kernel/i386_ksyms.c +++ linux/arch/i386/kernel/i386_ksyms.c @@ -6,10 +6,12 @@ /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking 
helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -25,7 +27,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux/arch/i386/kernel/i8259.c =================================================================== --- linux.orig/arch/i386/kernel/i8259.c +++ linux/arch/i386/kernel/i8259.c @@ -38,7 +38,7 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -369,7 +369,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { Index: linux/arch/i386/kernel/init_task.c =================================================================== --- linux.orig/arch/i386/kernel/init_task.c +++ linux/arch/i386/kernel/init_task.c @@ -10,8 +10,8 @@ #include #include -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; +static struct fs_struct init_fs = INIT_FS(init_fs); +static struct files_struct init_files = INIT_FILES(init_files); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); Index: linux/arch/i386/kernel/io_apic.c =================================================================== --- linux.orig/arch/i386/kernel/io_apic.c +++ linux/arch/i386/kernel/io_apic.c @@ -31,6 +31,7 @@ #include #include #include +#include #include 
#include #include @@ -46,7 +47,7 @@ int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; -static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * Is the SiS APIC rmw bug present ? @@ -55,11 +56,6 @@ static DEFINE_SPINLOCK(ioapic_lock); int sis_apic_bug = -1; /* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ @@ -128,19 +124,131 @@ static void __init replace_pin_at_irq(un } } +#ifdef CONFIG_X86_IOAPIC_FAST +# define IOAPIC_CACHE +#endif + + + +struct ioapic_data_struct { + struct sys_device dev; + int nr_registers; // # of IRQ routing registers + volatile unsigned int *base; + struct IO_APIC_route_entry *entry; +#ifdef IOAPIC_CACHE + unsigned int reg_set; + u32 cached_val[0]; +#endif +}; + +static struct ioapic_data_struct *ioapic_data[MAX_IO_APICS]; + + +static inline unsigned int __raw_io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg) +{ +# ifdef IOAPIC_CACHE + ioapic->reg_set = reg; +# endif + ioapic->base[0] = reg; + return ioapic->base[4]; +} + + +# ifdef IOAPIC_CACHE +static void __init ioapic_cache_init(struct ioapic_data_struct *ioapic) +{ + int reg; + for (reg = 0; reg < (0x10 + 2 * ioapic->nr_registers); reg++) + ioapic->cached_val[reg] = __raw_io_apic_read(ioapic, reg); +} +# endif + + +static unsigned int raw_io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg) +{ + unsigned int val = __raw_io_apic_read(ioapic, reg); + +# ifdef IOAPIC_CACHE + ioapic->cached_val[reg] = val; +# endif + return val; +} + +static unsigned int io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg) +{ +# ifdef IOAPIC_CACHE + if (likely(!sis_apic_bug)) { + ioapic->reg_set = -1; + return ioapic->cached_val[reg]; + } +# endif + return raw_io_apic_read(ioapic, reg); +} + +static void io_apic_write(struct ioapic_data_struct *ioapic, unsigned int reg, unsigned int val) +{ +# ifdef 
IOAPIC_CACHE + ioapic->cached_val[reg] = val; + ioapic->reg_set = reg; +# endif + ioapic->base[0] = reg; + ioapic->base[4] = val; +} + + +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + * + * It seems most systems need this - disable the optimization for now. + */ +#ifndef CONFIG_X86_IOAPIC_FAST +# define IOAPIC_POSTFLUSH +#endif + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static void io_apic_modify(struct ioapic_data_struct *ioapic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + ioapic->cached_val[reg] = val; + if (ioapic->reg_set != reg || sis_apic_bug) { + ioapic->reg_set = reg; +#else + if (unlikely(sis_apic_bug)) { +#endif + ioapic->base[0] = reg; + } + ioapic->base[4] = val; +#ifndef IOAPIC_POSTFLUSH + if (unlikely(sis_apic_bug)) +#endif + /* + * Force POST flush by reading: + */ + val = ioapic->base[4]; +} + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int pin, reg; + unsigned int pin, val; + struct ioapic_data_struct *ioapic; for (;;) { pin = entry->pin; if (pin == -1) break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); + ioapic = ioapic_data[entry->apic]; + val = io_apic_read(ioapic, 0x10 + pin*2); + val &= ~disable; + val |= enable; + io_apic_modify(ioapic, 0x10 + pin*2, val); if (!entry->next) break; entry = irq_2_pin + entry->next; @@ -148,29 +256,17 @@ static void __modify_IO_APIC_irq (unsign } /* mask = 1 */ -static void __mask_IO_APIC_irq (unsigned int irq) +static inline void __mask_IO_APIC_irq (unsigned int irq) {
__modify_IO_APIC_irq(irq, 0x00010000, 0); } /* mask = 0 */ -static void __unmask_IO_APIC_irq (unsigned int irq) +static inline void __unmask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -189,15 +285,15 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +static void clear_IO_APIC_pin(struct ioapic_data_struct *ioapic, unsigned int pin) { struct IO_APIC_route_entry entry; unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int*)&entry) + 0) = io_apic_read(ioapic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(ioapic, 0x11 + 2 * pin); spin_unlock_irqrestore(&ioapic_lock, flags); if (entry.delivery_mode == dest_SMI) return; @@ -208,8 +304,8 @@ static void clear_IO_APIC_pin(unsigned i memset(&entry, 0, sizeof(entry)); entry.mask = 1; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + io_apic_write(ioapic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(ioapic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -217,9 +313,14 @@ static void clear_IO_APIC (void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - 
clear_IO_APIC_pin(apic, pin); + for (apic = 0; apic < nr_ioapics; apic++) { + struct ioapic_data_struct *ioapic = ioapic_data[apic]; +#ifdef IOAPIC_CACHE + ioapic->reg_set = -1; +#endif + for (pin = 0; pin < ioapic->nr_registers; pin++) + clear_IO_APIC_pin(ioapic, pin); + } } static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) @@ -237,7 +338,7 @@ static void set_ioapic_affinity_irq(unsi pin = entry->pin; if (pin == -1) break; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + io_apic_write(ioapic_data[entry->apic], 0x10 + 1 + pin*2, apicid_value); if (!entry->next) break; entry = irq_2_pin + entry->next; @@ -828,7 +929,7 @@ void __init setup_ioapic_dest(void) return; for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + for (pin = 0; pin < ioapic_data[ioapic]->nr_registers; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; @@ -1071,7 +1172,7 @@ static int pin_2_irq(int idx, int apic, */ i = irq = 0; while (i < apic) - irq += nr_ioapic_registers[i++]; + irq += ioapic_data[i++]->nr_registers; irq += pin; /* @@ -1114,7 +1215,7 @@ static inline int IO_APIC_irq_trigger(in int apic, idx, pin; for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapic_data[apic]->nr_registers; pin++) { idx = find_irq_entry(apic,pin,mp_INT); if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) return irq_trigger(idx); @@ -1186,11 +1287,13 @@ static void __init setup_IO_APIC_irqs(vo struct IO_APIC_route_entry entry; int apic, pin, idx, irq, first_notcon = 1, vector; unsigned long flags; + struct ioapic_data_struct *ioapic; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + ioapic = ioapic_data[apic]; + for (pin = 0; pin < ioapic->nr_registers; pin++) { /* * add it to the IO-APIC 
irq-routing table: @@ -1247,8 +1350,8 @@ static void __init setup_IO_APIC_irqs(vo disable_8259A_irq(irq); } spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); } } @@ -1294,8 +1397,8 @@ static void __init setup_ExtINT_IRQ0_pin * Add it to the IO-APIC irq-routing table: */ spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(ioapic_data[0], 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic_data[0], 0x10+2*pin, *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); enable_8259A_irq(0); @@ -1305,7 +1408,7 @@ static inline void UNEXPECTED_IO_APIC(vo { } -void __init print_IO_APIC(void) +void /*__init*/ print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; @@ -1313,6 +1416,7 @@ void __init print_IO_APIC(void) union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; unsigned long flags; + struct ioapic_data_struct *ioapic; if (apic_verbosity == APIC_QUIET) return; @@ -1320,7 +1424,7 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].mpc_apicid, ioapic_data[i]->nr_registers); /* * We are a bit conservative about what we expect. 
We have to @@ -1329,14 +1433,14 @@ void __init print_IO_APIC(void) printk(KERN_INFO "testing the IO APIC.......................\n"); for (apic = 0; apic < nr_ioapics; apic++) { - + ioapic = ioapic_data[apic]; spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); + reg_00.raw = io_apic_read(ioapic, 0); + reg_01.raw = io_apic_read(ioapic, 1); if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); + reg_02.raw = io_apic_read(ioapic, 2); if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + reg_03.raw = io_apic_read(ioapic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); @@ -1407,8 +1511,8 @@ void __init print_IO_APIC(void) struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(ioapic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(ioapic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1454,7 +1558,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1601,9 +1705,7 @@ void /*__init*/ print_PIC(void) static void __init enable_IO_APIC(void) { - union IO_APIC_reg_01 reg_01; int i; - unsigned long flags; for (i = 0; i < PIN_MAP_SIZE; i++) { irq_2_pin[i].pin = -1; @@ -1614,16 +1716,6 @@ static void __init enable_IO_APIC(void) pirq_entries[i] = -1; /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (i = 0; i < nr_ioapics; i++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(i, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[i] = reg_01.bits.entries+1; - } - - /* * Do not trust the IO-APIC being empty at bootup */ clear_IO_APIC(); @@ -1666,8 +1758,7 @@ void 
disable_IO_APIC(void) * Add it to the IO-APIC irq-routing table: */ spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(ioapic_data[0], 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic_data[0], 0x10+2*pin, *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); } disconnect_bsp_APIC(pin != -1); @@ -1689,6 +1780,7 @@ static void __init setup_ioapic_ids_from int i; unsigned char old_id; unsigned long flags; + struct ioapic_data_struct *ioapic; /* * Don't check I/O APIC IDs for xAPIC systems. They have @@ -1706,10 +1798,10 @@ static void __init setup_ioapic_ids_from * Set the IOAPIC ID to the value stored in the MPC table. */ for (apic = 0; apic < nr_ioapics; apic++) { - + ioapic = ioapic_data[apic]; /* Read the register 0 value */ spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); + reg_00.raw = io_apic_read(ioapic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic].mpc_apicid; @@ -1770,14 +1862,14 @@ static void __init setup_ioapic_ids_from reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); + io_apic_write(ioapic, 0, reg_00.raw); spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); + reg_00.raw = io_apic_read(ioapic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) printk("could not set ID!\n"); @@ -1801,7 +1893,7 @@ static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - local_irq_enable(); + raw_local_irq_enable(); /* Let ten ticks pass... 
*/ mdelay((10 * 1000) / HZ); @@ -1865,9 +1957,11 @@ static unsigned int startup_edge_ioapic_ static void ack_edge_ioapic_irq(unsigned int irq) { move_irq(irq); +#if 0 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(irq); +#endif ack_APIC_irq(); } @@ -1892,6 +1986,30 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +#ifdef CONFIG_PREEMPT_HARDIRQS + +/* + * in the PREEMPT_HARDIRQS case we don't want to keep the local + * APIC unacked, because that prevents further interrupts from + * being handled - and with IRQ threads being delayed arbitrarily, + * that's unacceptable. So we first mask the IRQ, then ack it. + * The hardirq thread will then unmask it. + */ +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +#else + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + +#endif + static void end_level_ioapic_irq (unsigned int irq) { unsigned long v; @@ -1926,8 +2044,10 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -1954,6 +2074,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); @@ -1993,25 +2120,25 @@ static void set_ioapic_affinity_vector ( * races.
*/ static struct hw_interrupt_type ioapic_edge_type = { - .typename = "IO-APIC-edge", + .typename = "IO-APIC-edge", .startup = startup_edge_ioapic, .shutdown = shutdown_edge_ioapic, .enable = enable_edge_ioapic, .disable = disable_edge_ioapic, .ack = ack_edge_ioapic, .end = end_edge_ioapic, - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity, }; static struct hw_interrupt_type ioapic_level_type = { - .typename = "IO-APIC-level", + .typename = "IO-APIC-level", .startup = startup_level_ioapic, .shutdown = shutdown_level_ioapic, .enable = enable_level_ioapic, .disable = disable_level_ioapic, .ack = mask_and_ack_level_ioapic, .end = end_level_ioapic, - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity, }; static inline void init_IO_APIC_traps(void) @@ -2075,13 +2202,13 @@ static void ack_lapic_irq (unsigned int static void end_lapic_irq (unsigned int i) { /* nothing */ } static struct hw_interrupt_type lapic_irq_type = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq }; static void setup_nmi (void) @@ -2109,22 +2236,23 @@ static void setup_nmi (void) * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. 
--macro */ -static inline void unlock_ExtINT_logic(void) +static void __init unlock_ExtINT_logic(void) { int pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; unsigned long flags; + struct ioapic_data_struct *ioapic0 = ioapic_data[0]; pin = find_isa_irq_pin(8, mp_INT); if (pin == -1) return; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); + *(((int *)&entry0) + 1) = io_apic_read(ioapic0, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(ioapic0, 0x10 + 2 * pin); spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(0, pin); + clear_IO_APIC_pin(ioapic0, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2137,8 +2265,8 @@ static inline void unlock_ExtINT_logic(v entry1.vector = 0; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + io_apic_write(ioapic0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(ioapic0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); spin_unlock_irqrestore(&ioapic_lock, flags); save_control = CMOS_READ(RTC_CONTROL); @@ -2156,11 +2284,11 @@ static inline void unlock_ExtINT_logic(v CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(0, pin); + clear_IO_APIC_pin(ioapic0, pin); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + io_apic_write(ioapic0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(ioapic0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2170,10 +2298,11 @@ static inline void unlock_ExtINT_logic(v * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. 
*/ -static inline void check_timer(void) +static void __init check_timer(void) { int pin1, pin2; int vector; + struct ioapic_data_struct *ioapic0 = ioapic_data[0]; /* * get/set the timer IRQ vector: @@ -2191,7 +2320,10 @@ static inline void check_timer(void) */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = 1; +#ifdef CONFIG_PREEMPT_RT + if (nmi_watchdog) +#endif + timer_ack = 1; enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); @@ -2212,7 +2344,7 @@ static inline void check_timer(void) } return; } - clear_IO_APIC_pin(0, pin1); + clear_IO_APIC_pin(ioapic0, pin1); printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); } @@ -2237,7 +2369,7 @@ static inline void check_timer(void) /* * Cleanup, just in case ... */ - clear_IO_APIC_pin(0, pin2); + clear_IO_APIC_pin(ioapic0, pin2); } printk(" failed.\n"); @@ -2278,6 +2410,46 @@ static inline void check_timer(void) "report. Then try booting with the 'noapic' option"); } +void __init setup_IO_APIC_early(int _ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int size, nr_ioapic_registers; + volatile int *ioapic; + if (ioapic_data[_ioapic]) { + printk("been in %s before !!!!!\n", __FUNCTION__); + return; + } + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + _ioapic, mp_ioapics[_ioapic].mpc_apicaddr); + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(FIX_IO_APIC_BASE_0 + _ioapic), mp_ioapics[_ioapic].mpc_apicaddr); + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + ioapic = IO_APIC_BASE(_ioapic); + spin_lock_irqsave(&ioapic_lock, flags); + ioapic[0] = 1; + reg_01.raw = ioapic[4]; + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers = reg_01.bits.entries+1; + + /* + * Initialize ioapic_data struct: + */ + size = sizeof(struct ioapic_data_struct); +#ifdef IOAPIC_CACHE + size += 0x10 * sizeof(u32) + nr_ioapic_registers * sizeof(struct IO_APIC_route_entry); +#endif + ioapic_data[_ioapic] =
alloc_bootmem(size); + memset(ioapic_data[_ioapic], 0, size); + ioapic_data[_ioapic]->nr_registers = nr_ioapic_registers; + ioapic_data[_ioapic]->base = ioapic; +#ifdef IOAPIC_CACHE + ioapic_cache_init(ioapic_data[_ioapic]); +#endif +} + /* * * IRQ's that are handled by the PIC in the MPS IOAPIC case. @@ -2325,25 +2497,22 @@ static int __init io_apic_bug_finalize(v late_initcall(io_apic_bug_finalize); -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct ioapic_data_struct *data; unsigned long flags; int i; + struct ioapic_data_struct *ioapic; - data = container_of(dev, struct sysfs_ioapic_data, dev); + data = container_of(dev, struct ioapic_data_struct, dev); entry = data->entry; + + ioapic = ioapic_data[dev->id]; spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + for (i = 0; i < ioapic_data[dev->id]->nr_registers; i ++, entry ++) { + *(((int *)entry) + 1) = io_apic_read(ioapic, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(ioapic, 0x10 + 2 * i); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2353,23 +2522,25 @@ static int ioapic_suspend(struct sys_dev static int ioapic_resume(struct sys_device *dev) { struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct ioapic_data_struct *data; unsigned long flags; union IO_APIC_reg_00 reg_00; int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); + struct ioapic_data_struct *ioapic; + + data = container_of(dev, struct ioapic_data_struct, dev); entry = data->entry; + ioapic = ioapic_data[dev->id]; spin_lock_irqsave(&ioapic_lock, flags); - 
reg_00.raw = io_apic_read(dev->id, 0); + reg_00.raw = io_apic_read(ioapic, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; - io_apic_write(dev->id, 0, reg_00.raw); + io_apic_write(ioapic, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + for (i = 0; i < ioapic_data[dev->id]->nr_registers; i ++, entry ++) { + io_apic_write(ioapic, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(ioapic, 0x10+2*i, *(((int *)entry)+0)); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2392,21 +2563,20 @@ static int __init ioapic_init_sysfs(void return error; for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { + size = ioapic_data[i]->nr_registers * sizeof(struct IO_APIC_route_entry); + ioapic_data[i]->entry = kmalloc(size, GFP_KERNEL); + if (!ioapic_data[i]->entry) { printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); continue; } - memset(mp_ioapic_data[i], 0, size); - dev = &mp_ioapic_data[i]->dev; + memset(ioapic_data[i]->entry, 0, size); + dev = &ioapic_data[i]->dev; dev->id = i; dev->cls = &ioapic_sysdev_class; error = sysdev_register(dev); if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; + kfree(ioapic_data[i]->entry); + ioapic_data[i]->entry = NULL; printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); continue; } @@ -2423,13 +2593,14 @@ device_initcall(ioapic_init_sysfs); #ifdef CONFIG_ACPI_BOOT -int __init io_apic_get_unique_id (int ioapic, int apic_id) +int __init io_apic_get_unique_id (int apic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; physid_mask_t tmp; unsigned long flags; int i = 0; + struct ioapic_data_struct *ioapic = 
ioapic_data[apic]; /* * The P4 platform supports up to 256 APIC IDs on two separate APIC @@ -2449,7 +2620,7 @@ int __init io_apic_get_unique_id (int io if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); + "%d\n", apic, apic_id, reg_00.bits.ID); apic_id = reg_00.bits.ID; } @@ -2468,7 +2639,7 @@ int __init io_apic_get_unique_id (int io panic("Max apic_id exceeded!\n"); printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); + "trying %d\n", apic, apic_id, i); apic_id = i; } @@ -2486,50 +2657,50 @@ int __init io_apic_get_unique_id (int io /* Sanity check */ if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); + panic("IOAPIC[%d]: Unable change apic_id!\n", apic); } apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + "IOAPIC[%d]: Assigned apic_id %d\n", apic, apic_id); return apic_id; } -int __init io_apic_get_version (int ioapic) +int __init io_apic_get_version (int apic) { union IO_APIC_reg_01 reg_01; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); + reg_01.raw = io_apic_read(ioapic_data[apic], 1); spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } -int __init io_apic_get_redir_entries (int ioapic) +int __init io_apic_get_redir_entries (int apic) { union IO_APIC_reg_01 reg_01; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); + reg_01.raw = io_apic_read(ioapic_data[apic], 1); spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +int io_apic_set_pci_routing (int apic, int pin, int irq, int edge_level, int active_high_low) { struct IO_APIC_route_entry entry; unsigned long flags; - + struct ioapic_data_struct 
*ioapic = ioapic_data[apic]; if (!IO_APIC_IRQ(irq)) { printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); + apic); return -EINVAL; } @@ -2552,18 +2723,18 @@ int io_apic_set_pci_routing (int ioapic, * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + add_pin_to_irq(irq, apic, pin); entry.vector = assign_irq_vector(irq); apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", apic, + mp_ioapics[apic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); ioapic_register_intr(irq, entry.vector, edge_level); - if (!ioapic && (irq < 16)) + if (!apic && (irq < 16)) disable_8259A_irq(irq); spin_lock_irqsave(&ioapic_lock, flags); Index: linux/arch/i386/kernel/irq.c =================================================================== --- linux.orig/arch/i386/kernel/irq.c +++ linux/arch/i386/kernel/irq.c @@ -51,7 +51,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -59,8 +59,12 @@ fastcall unsigned int do_IRQ(struct pt_r union irq_ctx *curctx, *irqctx; u32 *isp; #endif - irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? 
*/ { @@ -69,7 +73,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } @@ -173,7 +177,7 @@ asmlinkage void do_softirq(void) if (in_interrupt()) return; - local_irq_save(flags); + raw_local_irq_save(flags); if (local_softirq_pending()) { curctx = current_thread_info(); @@ -194,7 +198,7 @@ asmlinkage void do_softirq(void) ); } - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); @@ -224,8 +228,10 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -235,15 +241,28 @@ int show_interrupts(struct seq_file *p, for_each_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %-14s", desc->handler->typename); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_PER_CPU, 'C'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); for_each_cpu(j) @@ -298,9 +317,9 @@ void fixup_irqs(cpumask_t map) barrier(); #else /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); + raw_local_irq_enable(); mdelay(1); - local_irq_disable(); + raw_local_irq_disable(); #endif } #endif Index: linux/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux/arch/i386/kernel/microcode.c =================================================================== --- linux.orig/arch/i386/kernel/microcode.c +++ linux/arch/i386/kernel/microcode.c @@ -109,7 +109,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); Index: 
linux/arch/i386/kernel/mpparse.c =================================================================== --- linux.orig/arch/i386/kernel/mpparse.c +++ linux/arch/i386/kernel/mpparse.c @@ -261,6 +261,7 @@ static void __init MP_ioapic_info (struc return; } mp_ioapics[nr_ioapics] = *m; + setup_IO_APIC_early(nr_ioapics); nr_ioapics++; } @@ -911,7 +912,7 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + setup_IO_APIC_early(idx); if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15)) mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); else Index: linux/arch/i386/kernel/nmi.c =================================================================== --- linux.orig/arch/i386/kernel/nmi.c +++ linux/arch/i386/kernel/nmi.c @@ -35,7 +35,7 @@ unsigned int nmi_watchdog = NMI_NONE; extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -113,8 +113,8 @@ static int __init check_nmi_watchdog(voi for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; - local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + raw_local_irq_enable(); + mdelay((100*1000)/nmi_hz); // wait 100 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_SMP @@ -135,7 +135,7 @@ static int __init check_nmi_watchdog(voi /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 10000; return 0; } @@ -478,13 +478,39 @@ void touch_nmi_watchdog (void) */ for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; + + /* + * Tickle the softlockup detector too: + */ + 
touch_softlockup_watchdog(); } extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +void notrace nmi_watchdog_tick (struct pt_regs * regs) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -492,7 +518,19 @@ void nmi_watchdog_tick (struct pt_regs * */ int sum, cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + /* + * Both count the APIC timer irqs and IRQ#0 irqs: + */ + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_this_cpu.irqs[0]; + + profile_tick(CPU_PROFILING, regs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* @@ -500,8 +538,24 @@ void nmi_watchdog_tick (struct pt_regs * * wait a few IRQs (5 seconds) before doing the oops ... 
*/ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; Index: linux/arch/i386/kernel/process.c =================================================================== --- linux.orig/arch/i386/kernel/process.c +++ linux/arch/i386/kernel/process.c @@ -102,12 +102,13 @@ EXPORT_SYMBOL(enable_hlt); void default_idle(void) { if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); + raw_local_irq_disable(); + if (!need_resched() && !need_resched_delayed()) + raw_safe_halt(); else - local_irq_enable(); + raw_local_irq_enable(); } else { + raw_local_irq_enable(); cpu_relax(); } } @@ -124,7 +125,7 @@ static void poll_idle (void) { int oldval; - local_irq_enable(); + raw_local_irq_enable(); /* * Deal with another CPU just having chosen a thread to @@ -139,7 +140,7 @@ static void poll_idle (void) "testl %0, %1;" "rep; nop;" "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + : : "i"(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), "m" (current_thread_info()->flags)); clear_thread_flag(TIF_POLLING_NRFLAG); } else { @@ -162,7 +163,7 @@ static inline void play_dead(void) /* * With physical CPU hotplug, we should halt the cpu */ - local_irq_disable(); + raw_local_irq_disable(); while (1) __asm__ __volatile__("hlt":::"memory"); } @@ -185,7 +186,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(raw_irqs_disabled()); 
+ + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -201,9 +204,13 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + stop_critical_timing(); + propagate_preempt_locks_value(); idle(); } - schedule(); + raw_local_irq_disable(); + __schedule(); + raw_local_irq_enable(); } } @@ -244,16 +251,16 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ static void mwait_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { set_thread_flag(TIF_POLLING_NRFLAG); do { __monitor((void *)&current_thread_info()->flags, 0, 0); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); - } while (!need_resched()); + } while (!need_resched() && !need_resched_delayed()); clear_thread_flag(TIF_POLLING_NRFLAG); } } @@ -384,11 +391,16 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + int cpu; + struct tss_struct *tss; + void *io_bitmap_ptr = t->io_bitmap_ptr; - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + mb(); + kfree(io_bitmap_ptr); + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); /* * Careful, clear this in the TSS too: */ Index: linux/arch/i386/kernel/reboot.c =================================================================== --- linux.orig/arch/i386/kernel/reboot.c +++ linux/arch/i386/kernel/reboot.c @@ -194,7 +194,7 @@ void machine_real_restart(unsigned char { unsigned long flags; - local_irq_disable(); + raw_local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST routine will recognize as telling it to do a proper reboot. 
(Well Index: linux/arch/i386/kernel/semaphore.c =================================================================== --- linux.orig/arch/i386/kernel/semaphore.c +++ linux/arch/i386/kernel/semaphore.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* @@ -49,12 +50,12 @@ * we cannot lose wakeup events. */ -static fastcall void __attribute_used__ __up(struct semaphore *sem) +static fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -static fastcall void __attribute_used__ __sched __down(struct semaphore * sem) +static fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ static fastcall void __attribute_used__ tsk->state = TASK_RUNNING; } -static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem) +static fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -154,7 +155,7 @@ static fastcall int __attribute_used__ _ * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. 
*/ -static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) +static fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -190,15 +191,15 @@ static fastcall int __attribute_used__ _ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -211,15 +212,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -232,15 +233,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -253,45 +254,20 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" 
-".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: linux/arch/i386/kernel/signal.c =================================================================== --- linux.orig/arch/i386/kernel/signal.c +++ linux/arch/i386/kernel/signal.c @@ -599,6 +599,13 @@ int fastcall do_signal(struct pt_regs *r int signr; struct k_sigaction ka; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + raw_local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/i386/kernel/smp.c =================================================================== --- linux.orig/arch/i386/kernel/smp.c +++ linux/arch/i386/kernel/smp.c @@ -164,7 +164,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu unsigned long cfg; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. 
@@ -187,7 +187,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu */ apic_write_around(APIC_ICR, cfg); - local_irq_restore(flags); + raw_local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) @@ -201,7 +201,7 @@ void send_IPI_mask_sequence(cpumask_t ma * should be modified to do 1 message per cluster ID - mbligh */ - local_irq_save(flags); + raw_local_irq_save(flags); for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { @@ -228,7 +228,7 @@ void send_IPI_mask_sequence(cpumask_t ma apic_write_around(APIC_ICR, cfg); } } - local_irq_restore(flags); + raw_local_irq_restore(flags); } #include /* must come after the send_IPI functions above for inlining */ @@ -246,7 +246,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -391,7 +391,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -482,10 +482,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -539,7 +549,7 @@ int smp_call_function (void (*func) (voi } /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON(raw_irqs_disabled()); data.func = func; data.info = info; @@ -573,7 +583,7 @@ static void stop_this_cpu (void * dummy) * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) __asm__("hlt"); @@ -588,19 +598,20 @@ void smp_send_stop(void) { smp_call_function(stop_this_cpu, NULL, 1, 0); - local_irq_disable(); + raw_local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + raw_local_irq_enable(); } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. 
*/ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux/arch/i386/kernel/smpboot.c =================================================================== --- linux.orig/arch/i386/kernel/smpboot.c +++ linux/arch/i386/kernel/smpboot.c @@ -57,6 +57,7 @@ #include #include #include +#include /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -277,6 +278,9 @@ static void __init synchronize_tsc_bp (v wmb(); atomic_inc(&tsc_count_stop); } +#ifdef CONFIG_HIGH_RES_TIMERS + CLEAR_REF_TSC; +#endif sum = 0; for (i = 0; i < NR_CPUS; i++) { @@ -516,7 +520,7 @@ static void __devinit start_secondary(vo per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* We can take interrupts now: we're officially "up". */ - local_irq_enable(); + raw_local_irq_enable(); wmb(); cpu_idle(); @@ -1303,9 +1307,9 @@ int __cpu_disable(void) /* We enable the timer again on the exit path of the death loop */ disable_APIC_timer(); /* Allow any queued timer interrupts to get serviced */ - local_irq_enable(); + raw_local_irq_enable(); mdelay(1); - local_irq_disable(); + raw_local_irq_disable(); remove_siblinginfo(cpu); @@ -1350,11 +1354,11 @@ int __devinit __cpu_up(unsigned int cpu) /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); - local_irq_enable(); + raw_local_irq_enable(); return -EIO; } - local_irq_enable(); + raw_local_irq_enable(); per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); Index: linux/arch/i386/kernel/time.c =================================================================== --- linux.orig/arch/i386/kernel/time.c +++ linux/arch/i386/kernel/time.c @@ -29,7 +29,10 @@ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). */ - +/* 2002-8-13 George Anzinger Modified for High res timers: + * Copyright (C) 2002 MontaVista Software +*/ +#define _INCLUDED_FROM_TIME_C #include #include #include @@ -65,6 +68,8 @@ #include #include +#include +#include #include "io_ports.h" @@ -78,17 +83,20 @@ u64 jiffies_64 = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); +/* Using APIC to generate smp_local_timer_interrupt? */ +int using_apic_timer = 0; + unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); #include -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); struct timer_opts *cur_timer __read_mostly = &timer_none; @@ -126,46 +134,58 @@ EXPORT_SYMBOL(rtc_cmos_write); * This version of gettimeofday has microsecond resolution * and better than microsecond precision on fast x86 machines with TSC. */ + +/* + * High res timers changes: First we want to use full nsec for all + * the math to avoid the double round off (on the offset and xtime). + * Second, we want to allow a boot with HRT turned off at boot time. + * This will cause hrtimer_use to be false, and we then fall back to + * the old code. We also shorten the xtime lock region and eliminate + * the lost tick code as this kernel will never have lost ticks under + * the lock (i.e. wall_jiffies will never differ from jiffies except + * when the write xtime lock is held). 
+ */ void do_gettimeofday(struct timeval *tv) { unsigned long seq; - unsigned long usec, sec; + unsigned long sec, nsec, clk_nsec; unsigned long max_ntp_tick; do { - unsigned long lost; - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); +#ifdef CONFIG_HIGH_RES_TIMERS + nsec = arch_cycle_to_nsec(get_arch_cycles(wall_jiffies)); +#else + nsec = cur_timer->get_offset() * NSEC_PER_USEC; +#endif sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); + clk_nsec = xtime.tv_nsec; } while (read_seqretry(&xtime_lock, seq)); - while (usec >= 1000000) { - usec -= 1000000; + /* + * If time_adjust is negative then NTP is slowing the clock + * so make sure not to go into next possible interval. + * Better to lose some accuracy than have time go backwards.. + + * Note, in this kernel wall_jiffies and jiffies will always + * be the same, at least under the lock. 
+ */ + if (unlikely(time_adjust < 0)) { + max_ntp_tick = tick_nsec - (tickadj * NSEC_PER_USEC); + if (max_ntp_tick > nsec) + nsec = max_ntp_tick - nsec; + } + + nsec += clk_nsec; + + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; sec++; } tv->tv_sec = sec; - tv->tv_usec = usec; + tv->tv_usec = nsec / NSEC_PER_USEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -236,7 +256,7 @@ unsigned long long monotonic_clock(void) EXPORT_SYMBOL(monotonic_clock); #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -252,41 +272,26 @@ EXPORT_SYMBOL(profile_pc); * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static inline void do_timer_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +static inline void do_timer_interrupt(struct pt_regs *regs) { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { + unsigned long flags; /* * Subtle, when I/O APICs are used we have to ack timer IRQ * manually to reset the IRR bit for do_slow_gettimeoffset(). * This will also deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + spin_lock_irqsave(&i8259A_lock, flags); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + spin_unlock_irqrestore(&i8259A_lock, flags); } #endif do_timer_interrupt_hook(regs); - - - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). 
Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - irq = inb_p( 0x61 ); /* read the current state */ - outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ - } } /* @@ -307,12 +312,109 @@ irqreturn_t timer_interrupt(int irq, voi cur_timer->mark_offset(); - do_timer_interrupt(irq, NULL, regs); + do_timer_interrupt(regs); +#ifdef CONFIG_MCA + /* + * This code moved here from do_timer_interrupt() as part of the + * high-res timers change because it should be done every interrupt + * but do_timer_interrupt() wants to return early if it is not a + * "1/HZ" tick interrupt. For non-high-res systems the code is in + * exactly the same location (i.e. it is moved from the tail of the + * above called function to the next thing after the function). + */ + if( MCA_bus ) { + int irq; + /* The PS/2 uses level-triggered interrupts. You can't + turn them off, nor would you want to (any attempt to + enable edge-triggered interrupts usually gets intercepted by a + special hardware circuit). Hence we have to acknowledge + the timer interrupt. Through some incredibly stupid + design idea, the reset for IRQ 0 is done by setting the + high bit of the PPI port B (0x61). Note that some PS/2s, + notably the 55SX, work fine if this is removed. */ + + irq = inb_p( 0x61 ); /* read the current state */ + outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + } +#endif write_sequnlock(&xtime_lock); +#if defined(CONFIG_SMP) && defined(CONFIG_HIGH_RES_TIMERS) + send_IPI_allbutself(LOCAL_TIMER_IPI_VECTOR); + // we profile via the NMI +#if 0 + profile_tick(CPU_PROFILING, regs); +#endif + per_cpu(irq_stat, smp_processor_id()).apic_timer_irqs++; + update_process_times(user_mode(regs)); +#endif return IRQ_HANDLED; } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We always continue to provide interrupts even if they are not + * serviced. To do this, we leave the chip in periodic mode programmed + * to interrupt every jiffie. 
This is done by, for short intervals, + * programming a short time, waiting till it is loaded and then + * programming the 1/HZ. The chip will not load the 1/HZ count till the + * short count expires. If the last interrupt was programmed to be + * short, we need to program another short to cover the remaining part + * of the jiffie and can then just leave the chip alone. Note that it + * is also a low overhead way of doing things as we do not have to mess + * with the chip MOST of the time. + */ + +/* Called with xtime lock held */ +void schedule_jiffy_int(void) +{ + unsigned long cycles; + BUG_ON(!highres_active || using_apic_timer); + + if (highres_pending) + printk("Highres pending\n"); + cycles = get_arch_cycles(jiffies); + reload_timer_chip(arch_cycles_per_jiffy - cycles); +} + +/* + * We get the distance to the next event in arch_cycles + * Called with xtime lock held + */ +int _schedule_next_int(long arch_cycle_in) +{ + long arch_cycle_offset; + + BUG_ON(!highres_active); + + /* + * If time is already passed, just return saying so. 
+ */ + if (arch_cycle_in <= 0) + return -ETIME; + + highres_pending = 1; + arch_cycle_offset = get_arch_cycles(jiffies); + + if (!using_apic_timer) { + /* Check, if we have to schedule a jiffy first */ + arch_cycle_offset = arch_cycles_per_jiffy - arch_cycle_offset; + if (arch_cycle_in > arch_cycle_offset) + arch_cycle_in = arch_cycle_offset; + } + + reload_timer_chip(arch_cycle_in); + return 0; +} + +#ifdef CONFIG_APM +void restart_timer(void) +{ + start_PIT(); +} +#endif /* CONFIG_APM */ +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* not static: needed by APM */ unsigned long get_cmos_time(void) { @@ -333,14 +435,20 @@ EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); -static struct timer_list sync_cmos_timer = - TIMER_INITIALIZER(sync_cmos_clock, 0, 0); +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); static void sync_cmos_clock(unsigned long dummy) { struct timeval now, next; int fail = 1; + /* + * This is dumb for two reasons. + * 1.) it is based on wall time which has not yet been updated. + * 2.) it is checked each tick for something that happens each + * 10 min. Why not use a timer for it? Much lower overhead, + * in fact, zero if STA_UNSYNC is set. + */ /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. 
Set_rtc_mmss() has to be @@ -412,6 +520,7 @@ static int timer_resume(struct sys_devic write_sequnlock_irqrestore(&xtime_lock, flags); jiffies += sleep_length; wall_jiffies += sleep_length; + touch_softlockup_watchdog(); return 0; } Index: linux/arch/i386/kernel/time_hpet.c =================================================================== --- linux.orig/arch/i386/kernel/time_hpet.c +++ linux/arch/i386/kernel/time_hpet.c @@ -302,11 +302,11 @@ int hpet_rtc_timer_init(void) else hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - local_irq_save(flags); + raw_local_irq_save(flags); cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); - local_irq_restore(flags); + raw_local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; Index: linux/arch/i386/kernel/timers/Makefile =================================================================== --- linux.orig/arch/i386/kernel/timers/Makefile +++ linux/arch/i386/kernel/timers/Makefile @@ -7,3 +7,6 @@ obj-y := timer.o timer_none.o timer_tsc. obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o obj-$(CONFIG_HPET_TIMER) += timer_hpet.o obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o +obj-$(CONFIG_HIGH_RES_TIMER_ACPI_PM) += hrtimer_pm.o +obj-$(CONFIG_HIGH_RES_TIMER_ACPI_PM) += high-res-tbxfroot.o +obj-$(CONFIG_HIGH_RES_TIMER_TSC) += hrtimer_tsc.o \ No newline at end of file Index: linux/arch/i386/kernel/timers/common.c =================================================================== --- linux.orig/arch/i386/kernel/timers/common.c +++ linux/arch/i386/kernel/timers/common.c @@ -23,7 +23,7 @@ * device. */ -#define CALIBRATE_TIME (5 * 1000020/HZ) +__initdata unsigned long tsc_cycles_per_50_ms; unsigned long calibrate_tsc(void) { @@ -58,6 +58,12 @@ unsigned long calibrate_tsc(void) if (endlow <= CALIBRATE_TIME) goto bad_ctc; + /* + * endlow at this point is 50 ms of arch clocks + * Set up the value for others who want high res.
+ */ + tsc_cycles_per_50_ms = endlow; + __asm__("divl %2" :"=a" (endlow), "=d" (endhigh) :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); @@ -71,6 +77,7 @@ unsigned long calibrate_tsc(void) * 32 bits.. */ bad_ctc: + printk("******************** TSC calibrate failed!\n"); return 0; } Index: linux/arch/i386/kernel/timers/high-res-tbxfroot.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/timers/high-res-tbxfroot.c @@ -0,0 +1,273 @@ +/****************************************************************************** + * + * Module Name: tbxfroot - Find the root ACPI table (RSDT) + * $Revision: 1.4 $ + * + *****************************************************************************/ + +/* + * Copyright (C) 2000, 2001 R. Byron Moore + + * This code purloined and modified by George Anzinger + * Copyright (C) 2002 by MontaVista Software. + * It is part of the high-res-timers ACPI option and its sole purpose is + * to find the darn timer. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* This is most annoying! We want to find the address of the pm timer in the + * ACPI hardware package. We know there is one if ACPI is available at all + * as it is part of the basic ACPI hardware set. 
+ * However, the powers that be have conspired to make it a real + * pain to find the address. We have written a minimal search routine + * that we use only once on boot up. We try to cover all the bases including + * checksum, and version. We will try to get some constants and structures + * from the ACPI code in an attempt to follow it, but darn, what a mess. + * + * First problem, the include files are in the driver package.... + * and what a mess they are. We pick up the kernel string and types first. + + * But then there is the COMPILER_DEPENDENT_UINT64 ... + */ +//#define ACPI_MACHINE_WIDTH BITS_PER_LONG +#define COMPILER_DEPENDENT_UINT64 unsigned long long +#define COMPILER_DEPENDENT_INT64 long long +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STRNCMP(d,s,n) strncmp((d), (s), (NATIVE_INT)(n)) +#define RSDP_CHECKSUM_LENGTH 20 +#define NATIVE_INT INT32 +#define NATIVE_CHAR char + +#ifndef CONFIG_ACPI_enough /* I am tired of trying to use the acpi stuff */ + /* this code works, lets just use it. */ +/******************************************************************************* + * + * FUNCTION: hrt_acpi_checksum + * + * PARAMETERS: Buffer - Buffer to checksum + * Length - Size of the buffer + * + * RETURNS 8 bit checksum of buffer + * + * DESCRIPTION: Computes an 8 bit checksum of the buffer(length) and returns it. 
+ * + ******************************************************************************/ +static __init u8 +hrt_acpi_checksum(void *buffer, u32 length) +{ + u8 *limit; + u8 *rover; + u8 sum = 0; + + if (buffer && length) { + /* Buffer and Length are valid */ + + limit = (u8 *) buffer + length; + + for (rover = buffer; rover < limit; rover++) { + sum = (u8) (sum + *rover); + } + } + + return (sum); +} + +/******************************************************************************* + * + * FUNCTION: hrt_acpi_scan_memory_for_rsdp + * + * PARAMETERS: Start_address - Starting pointer for search + * Length - Maximum length to search + * + * RETURN: Pointer to the RSDP if found, otherwise NULL. + * + * DESCRIPTION: Search a block of memory for the RSDP signature + * + ******************************************************************************/ +static __init u8 * +hrt_acpi_scan_memory_for_rsdp(u8 * start_address, u32 length) +{ + u32 offset; + u8 *mem_rover; + + /* Search from given start addr for the requested length */ + + for (offset = 0, mem_rover = start_address; + offset < length; + offset += ACPI_RSDP_SCAN_STEP, mem_rover += ACPI_RSDP_SCAN_STEP) { + + /* The signature and checksum must both be correct */ + + if (STRNCMP((NATIVE_CHAR *) mem_rover, + RSDP_SIG, sizeof (RSDP_SIG) - 1) == 0 && + hrt_acpi_checksum(mem_rover, RSDP_CHECKSUM_LENGTH) == 0) { + /* If so, we have found the RSDP */ + + ; + return (mem_rover); + } + } + + /* Searched entire block, no RSDP was found */ + + return (NULL); +} + +/******************************************************************************* + * + * FUNCTION: hrt_acpi_find_rsdp + * + * PARAMETERS: + * + * RETURN: Logical address of rsdp + * + * DESCRIPTION: Search lower 1_mbyte of memory for the root system descriptor + * pointer structure. If it is found, return its address, + * else return 0. 
+ * + * NOTE: The RSDP must be either in the first 1_k of the Extended + * BIOS Data Area or between E0000 and FFFFF (ACPI 1.0 section + * 5.2.2; assertion #421). + * + ******************************************************************************/ +/* Constants used in searching for the RSDP in low memory */ + +#define LO_RSDP_WINDOW_BASE 0 /* Physical Address */ +#define HI_RSDP_WINDOW_BASE 0xE0000 /* Physical Address */ +#define LO_RSDP_WINDOW_SIZE 0x400 +#define HI_RSDP_WINDOW_SIZE 0x20000 +#define RSDP_DESCRIPTOR struct rsdp_descriptor +static __init RSDP_DESCRIPTOR * +hrt_find_acpi_rsdp(void) +{ + u8 *mem_rover; + + /* + * 1) Search EBDA (low memory) paragraphs + */ + mem_rover = + hrt_acpi_scan_memory_for_rsdp((u8 *) __va(LO_RSDP_WINDOW_BASE), + LO_RSDP_WINDOW_SIZE); + + if (!mem_rover) { + /* + * 2) Search upper memory: + * 16-byte boundaries in E0000h-F0000h + */ + mem_rover = + hrt_acpi_scan_memory_for_rsdp((u8 *) + __va(HI_RSDP_WINDOW_BASE), + HI_RSDP_WINDOW_SIZE); + } + + if (mem_rover) { + /* Found it, return the logical address */ + + return (RSDP_DESCRIPTOR *) mem_rover; + } + return (RSDP_DESCRIPTOR *) 0; +} + +__init u32 hrt_get_acpi_pm_ptr(void) +{ + struct fadt_descriptor_rev2 *fadt2; + struct fadt_descriptor_rev1 *fadt1; + struct rsdt_descriptor_rev1 *rsdt; + struct xsdt_descriptor_rev2 *xsdt; + RSDP_DESCRIPTOR *rsdp = hrt_find_acpi_rsdp(); + struct acpi_table_header *header; + u32 rtn; + + if (!rsdp) { + printk("ACPI: System description tables not found\n"); + return 0; + } + /* + * Now that we have that problem out of the way, lets set up this + * timer. We need to figure the addresses based on the revision + * of ACPI, which is in this here table we just found. + * We will not check the RSDT checksum, but will the FADT. 
+ */ + if (rsdp->revision == 2) { + xsdt = + (struct xsdt_descriptor_rev2 *) + __va(rsdp->xsdt_physical_address); + fadt2 = + (struct fadt_descriptor_rev2 *) + __va(xsdt->table_offset_entry[0]); + header = (struct acpi_table_header *) fadt2; + rtn = (u32) fadt2->xpm_tmr_blk.address; + } else { + rsdt = + (struct rsdt_descriptor_rev1 *) + __va(rsdp->rsdt_physical_address); + fadt1 = + (struct fadt_descriptor_rev1 *) + __va(rsdt->table_offset_entry[0]); + header = (struct acpi_table_header *) fadt1; + rtn = (u32) fadt1->pm_tmr_blk; + } + /* + * Verify the signature and the checksum, if good, return + * the address. + */ + if (STRNCMP((NATIVE_CHAR *) header->signature, + FADT_SIG, sizeof (FADT_SIG) - 1) == 0 && + hrt_acpi_checksum((NATIVE_CHAR *) header, header->length) == 0) + return rtn; + + printk("ACPI: Signature or checksum failed on FADT\n"); + return 0; +} + +#else +extern int acpi_get_firmware_table(acpi_string signature, + u32 instance, + u32 flags, + struct acpi_table_header ** table_pointer); + +extern struct fadt_descriptor_rev2 acpi_fadt; +__init u32 hrt_get_acpi_pm_ptr(void) +{ + struct fadt_descriptor_rev2 *fadt = &acpi_fadt; + struct fadt_descriptor_rev2 local_fadt; + + if (!fadt || !fadt->header.signature[0]) { + fadt = &local_fadt; + fadt->header.signature[0] = '\0'; + acpi_get_firmware_table("FACP", 1, ACPI_PHYSICAL_POINTER, + (struct acpi_table_header **) & fadt); + } + if (!fadt || !fadt->header.signature[0]) { + printk("ACPI: Could not find the ACPI pm timer."); + } + + if (fadt->header.revision == 2) { + return (u32) fadt->xpm_tmr_blk.address; + } else { + return (u32) fadt->V1_pm_tmr_blk; + } +} +#endif Index: linux/arch/i386/kernel/timers/hrtimer_pm.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/timers/hrtimer_pm.c @@ -0,0 +1,200 @@ +/* + * This code largely moved from arch/i386/kernel/time.c. + * See comments there for proper credits. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define OK_TO_DO_IO_LOOP // How to do the delay stuff. + + +extern unsigned long do_highres_gettimeoffset_pm(void) +{ + /* + * We are under the xtime_lock here. + */ + return arch_cycle_to_usec(get_arch_cycles(jiffies)); +} + +static void high_res_mark_offset_pm(void) +{ + return; +} +unsigned long long monotonic_clock_hr_pm(void) +{ + unsigned long long timestamp; + unsigned long seq; + do { + seq = read_seqbegin(&xtime_lock); + timestamp = jiffies_64 * (NSEC_PER_SEC / HZ) + + arch_cycle_to_usec(get_arch_cycles(jiffies)); + } while (read_seqretry(&xtime_lock, seq)); + return timestamp; +} + +#ifdef OK_TO_DO_IO_LOOP +/* + * This routine is I/O intensive. If this is a problem we will have to + * use a compute loop as in the PIT code. It is NOT affected by the + * cpu clock, however. + */ +static void delay_pm(unsigned long loops) +{ + unsigned long bclock = inl(acpi_pm_tmr_address); + + /* + * XXX it doesn't depend on a number of processor cycles so + * the value may be very different from the usual one, is that + * a problem? -eric + */ + do { + rep_nop(); + } while (((inl(acpi_pm_tmr_address) - bclock) & SIZE_MASK) < loops); +} +#else +/* + * Avoids the I/O intense stuff but is affected by cpu clock shifting. 
+ */ +static void delay_pm(unsigned long loops) +{ + int d0; + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +#endif + +#ifdef CONFIG_CPU_FREQ +static unsigned int ref_freq = 0; +static unsigned int variable_tsc = 1; + +#ifdef OK_TO_DO_IO_LOOP +static unsigned long loops_per_jiffy_ref = 0; +#endif + +#ifndef CONFIG_SMP +static unsigned long cpu_khz_ref = 0; +#endif + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + + if (!ref_freq) { + ref_freq = freq->old; +#ifdef OK_TO_DO_IO_LOOP + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; +#endif + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { +#ifdef OK_TO_DO_IO_LOOP + if (variable_tsc) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); +#endif + } + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { +.notifier_call = time_cpufreq_notifier +}; +#endif + + +static int high_res_init_pm(char * override) +{ + if (override[0] && strncmp(override,"hr_pm",5)) + return -ENODEV; + + /* report CPU clock rate in Hz. + * The formula is: + * (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. 
+ */ + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc(); + if(tsc_quotient){ + cpu_khz = div_sc32( 1000, tsc_quotient); + { + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + } + } + } + acpi_pm_tmr_address = hrt_get_acpi_pm_ptr(); + if (!acpi_pm_tmr_address){ + printk(pm_hrtimer_message, default_pm_add); + if ( (acpi_pm_tmr_address = default_pm_add)){ + last_update += quick_get_cpuctr(); + hrt_udelay(4); + if (!quick_get_cpuctr()){ + printk("High-res-timers: No ACPI pm " + "timer found at %d.\n", + acpi_pm_tmr_address); + acpi_pm_tmr_address = 0; + } + } + } else { + if (default_pm_add != acpi_pm_tmr_address) { + printk("High-res-timers: Ignoring supplied " + "default ACPI pm timer address.\n"); + } + last_update += quick_get_cpuctr(); + } + start_PIT(); + if (!acpi_pm_tmr_address){ + printk(pm_hrtimer_fail_message); + return -EINVAL; + } else { + printk("High-res-timers: Found ACPI pm timer at %d\n", + acpi_pm_tmr_address); + } +#ifdef CONFIG_CPU_FREQ + /* P4 and above CPU TSC freq doesn't change when CPU frequency changes*/ + if ((boot_cpu_data.x86 >= 15) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) + variable_tsc = 0; + + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); +#endif + return 0; +} + +/************************************************************/ + + +/* hr_pm timer_opts struct */ +struct timer_opts hrtimer_pm = { + .name = "hrt_pm", + .mark_offset = high_res_mark_offset_pm, + .get_offset = do_highres_gettimeoffset_pm, + .monotonic_clock = monotonic_clock_hr_pm, + .delay = delay_pm, +}; + +struct init_timer_opts __initdata hrtimer_pm_init = { + .init = high_res_init_pm, + .opts = &hrtimer_pm, +}; Index: linux/arch/i386/kernel/timers/hrtimer_tsc.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/timers/hrtimer_tsc.c @@ -0,0 +1,274 @@ +/* + * This code largely moved from arch/i386/kernel/time.c. 
+ * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mach_timer.h" + +extern int x86_udelay_tsc; +extern unsigned long tsc_cycles_per_50_ms; + +#if 0 //ndef CONFIG_SMP +static int use_tsc; +#endif + +/* Cached *multiplier* to convert TSC counts to microseconds. + * (see the equation below). + * Equal to 2^32 * (1 / (clocks per usec) ). + * Initialized in time_init. + */ +static unsigned long fast_gettimeoffset_quotient; + +static unsigned long do_highres_gettimeoffset(void) +{ + /* + * We are under the xtime_lock here. + */ + return arch_cycle_to_usec(get_arch_cycles(jiffies)); +} + +static void high_res_mark_offset_tsc(void) +{ + return; +} +static unsigned long long monotonic_clock_hr_tsc(void) +{ + unsigned long long timestamp; + unsigned long seq; + do { + seq = read_seqbegin(&xtime_lock); + timestamp = jiffies_64 * (NSEC_PER_SEC / HZ) + + arch_cycle_to_usec(get_arch_cycles(jiffies)); + } while (read_seqretry(&xtime_lock, seq)); + return timestamp; +} +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +#ifdef CONFIG_CPU_FREQ +/* If the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. An exception to this + * are modern Intel Pentium 4 processors, where the TSC runs at a constant + * speed independent of frequency scaling. 
+ */ +static unsigned long ref_arch_to_usec; +static unsigned long ref_arch_to_latch; +static unsigned long ref_arch_to_nsec; +static unsigned long ref_usec_to_arch; +static unsigned long ref_nsec_to_arch; +static long ref_arch_cycles_per_jiffy; +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned int variable_tsc = 1; + +#ifndef CONFIG_SMP +static unsigned long cpu_khz_ref = 0; +#endif + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + ref_arch_to_usec = arch_to_usec; + ref_arch_to_latch = arch_to_latch; + ref_arch_to_nsec = arch_to_nsec; + ref_nsec_to_arch = nsec_to_arch; + ref_usec_to_arch = usec_to_arch; + ref_arch_cycles_per_jiffy = arch_cycles_per_jiffy; +#if 0 /* ndef CONFIG_SMP cpu_khz is done in timer_tsc.c */ + cpu_khz_ref = cpu_khz; +#endif + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + + if (variable_tsc) { + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + arch_to_usec = + /* fast_gettimeoffset_quotient is done + * timer_tsc.c + fast_gettimeoffset_quotient = + */ + cpufreq_scale(ref_arch_to_usec, + freq->new, ref_freq); + arch_to_latch = + cpufreq_scale(ref_arch_to_latch, + freq->new, ref_freq); + arch_to_nsec = + cpufreq_scale(ref_arch_to_nsec, + freq->new, ref_freq); + nsec_to_arch = + cpufreq_scale(ref_nsec_to_arch, + ref_freq, freq->new); + usec_to_arch = + cpufreq_scale(ref_usec_to_arch, + ref_freq, freq->new); + arch_cycles_per_jiffy = + cpufreq_scale(ref_arch_cycles_per_jiffy, + ref_freq, freq->new); + } +#if 0 /* ndef CONFIG_SMP_use_timer_tsc */ + if (use_tsc) + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); +#endif + } + + return 0; +} + +static struct 
notifier_block time_cpufreq_notifier_block = { + notifier_call: time_cpufreq_notifier +}; +#endif + +static int high_res_init_tsc(char * override) +{ + if (override[0] && strncmp(override,"hrtsc",5)) + return -ENODEV; + + if (tsc_disable) { + printk(KERN_WARNING "notsc: Kernel compiled with " + "CONFIG_HIGH_RES_TIMERS" + " TSC, cannot disable TSC.\n"); + tsc_disable = 0; + } + /* + * If we have APM enabled or the CPU clock speed is variable + * (CPU stops clock on HLT or slows clock to save power) + * then the TSC timestamps may diverge by up to 1 jiffy from + * 'real time' but nothing will break. + * The most frequent case is that the CPU is "woken" from a halt + * state by the timer interrupt itself, so we get 0 error. In the + * rare cases where a driver would "wake" the CPU and request a + * timestamp, the maximum error is < 1 jiffy. But timestamps are + * still perfectly ordered. + * Note that the TSC counter will be reset if APM suspends + * to disk; this won't break the kernel, though, 'cuz we're + * smart. See arch/i386/kernel/apm.c. + */ + /* + * Firstly we have to do a CPU check for chips with + * a potentially buggy TSC. At this point we haven't run + * the ident/bugs checks so we must run this hook as it + * may turn off the TSC flag. + * + * NOTE: this doesn't yet handle SMP 486 machines where only + * some CPUs have a TSC. That's never worked and nobody has + * moaned if you have the only one in the world - you fix it! + */ + + /* FIXME: Where is this gone ? */ + //dodgy_tsc(); + + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc(); + if (tsc_quotient) { + fast_gettimeoffset_quotient = tsc_quotient; + /* + * We could be more selective here I suspect + * and just enable this for the next intel chips ? + */ + /* + * Kick off the high res timers + */ + /* + * The init_hrtimers macro is in the chosen + * support package depending on the clock + * source, TSC, or ACPI pm timer.
+ */ + arch_to_usec = fast_gettimeoffset_quotient; + + arch_to_latch = div_ll_X_l( + mpy_l_X_l_ll(fast_gettimeoffset_quotient, + CLOCK_TICK_RATE), + (USEC_PER_SEC)); + + arch_to_nsec = div_sc_n(HR_TIME_SCALE_NSEC, + CALIBRATE_TIME * NSEC_PER_USEC, + tsc_cycles_per_50_ms); + + nsec_to_arch = div_sc_n(HR_TIME_SCALE_NSEC, + tsc_cycles_per_50_ms, + CALIBRATE_TIME * NSEC_PER_USEC); + + usec_to_arch = div_sc_n(HR_TIME_SCALE_USEC, + tsc_cycles_per_50_ms, + CALIBRATE_TIME ); + + arch_cycles_per_jiffy = nsec_to_arch_cycle(tick_nsec); + init_hrtimers(); + + start_PIT(); + + /* report CPU clock rate in Hz. + * The formula is: + * (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. + */ + cpu_khz = div_sc32( 1000, tsc_quotient); + { + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + } +#ifdef CONFIG_CPU_FREQ + /* + * P4 and above CPU TSC freq doesn't change when + * CPU frequency changes + */ + if ((boot_cpu_data.x86 >= 15) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) + variable_tsc = 0; + + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); +#endif + return 0; + } + } + return -ENODEV; +} + +/************************************************************/ + +/* tsc timer_opts struct */ +struct timer_opts hrtimer_tsc = { + .name = "hrt_tsc", + .mark_offset = high_res_mark_offset_tsc, + .get_offset = do_highres_gettimeoffset, + .monotonic_clock = monotonic_clock_hr_tsc, + .delay = delay_tsc, +}; + +struct init_timer_opts __initdata hrtimer_tsc_init = { + .init = high_res_init_tsc, + .opts = &hrtimer_tsc, +}; Index: linux/arch/i386/kernel/timers/timer.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer.c +++ linux/arch/i386/kernel/timers/timer.c @@ -1,10 +1,12 @@ #include #include #include +#include #include #ifdef CONFIG_HPET_TIMER /* + * If high res, we put that first... 
* HPET memory read is slower than tsc reads, but is more dependable as it * always runs at constant frequency and reduces complexity due to * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use @@ -13,7 +15,17 @@ #endif /* list of timers, ordered by preference, NULL terminated */ static struct init_timer_opts* __initdata timers[] = { +#ifdef CONFIG_HIGH_RES_TIMERS +#ifdef CONFIG_HIGH_RES_TIMER_ACPI_PM + &hrtimer_pm_init, +#elif CONFIG_HIGH_RES_TIMER_TSC + &hrtimer_tsc_init, +#endif /* CONFIG_HIGH_RES_TIMER_ACPI_PM */ +#endif #ifdef CONFIG_X86_CYCLONE_TIMER +#ifdef CONFIG_HIGH_RES_TIMERS +#error "The High Res Timers option is incompatable with the Cyclone timer" +#endif &timer_cyclone_init, #endif #ifdef CONFIG_HPET_TIMER @@ -28,6 +40,7 @@ static struct init_timer_opts* __initdat }; static char clock_override[10] __initdata; +#ifndef CONFIG_HIGH_RES_TIMERS_try static int __init clock_setup(char* str) { @@ -45,7 +58,7 @@ void clock_fallback(void) { cur_timer = &timer_pit; } - +#endif /* iterates through the list of timers, returning the first * one that initializes successfully. 
*/ Index: linux/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_cyclone.c +++ linux/arch/i386/kernel/timers/timer_cyclone.c @@ -36,7 +36,7 @@ static u32* volatile cyclone_timer; /* C static u32 last_cyclone_low; static u32 last_cyclone_high; static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* helper macro to atomically read both cyclone counter registers */ #define read_cyclone_counter(low,high) \ Index: linux/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_hpet.c +++ linux/arch/i386/kernel/timers/timer_hpet.c @@ -24,7 +24,7 @@ static unsigned long hpet_last; /* hpet static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ static unsigned long long monoto