Index: linux/net/core/netpoll.c =================================================================== --- linux.orig/net/core/netpoll.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/core/netpoll.c 2005-06-06 16:04:14.000000000 -0300 @@ -152,7 +152,9 @@ return; /* Process pending work on NIC */ + WARN_ON_RT(irqs_disabled()); np->dev->poll_controller(np->dev); + WARN_ON_RT(irqs_disabled()); if (np->dev->poll) poll_napi(np); @@ -179,28 +181,31 @@ static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; - local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - if(skb->destructor) - dev_kfree_skb_any(skb); /* put this one back */ - else - __kfree_skb(skb); - } } + /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if(skb->destructor) + dev_kfree_skb_any(skb); /* put this one back */ + else + __kfree_skb(skb); + } } static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) @@ -263,7 +268,7 @@ } spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + np->dev->xmit_lock_owner = _smp_processor_id(); /* * network drivers do not expect to be called if the queue is Index: linux/net/core/dev.c =================================================================== --- linux.orig/net/core/dev.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/core/dev.c 2005-06-06 16:04:14.000000000 -0300 @@ -1306,10 +1306,16 @@ Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + int cpu = _smp_processor_id(); /* ok because BHs are off 
*/ + /* + * No need to check for recursion with threaded interrupts: + */ +#ifdef CONFIG_PREEMPT_RT + if (1) { +#else if (dev->xmit_lock_owner != cpu) { - +#endif HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { @@ -1534,6 +1540,11 @@ BUG_TRAP(!atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. + */ + cond_resched_all(); } } @@ -1556,10 +1567,17 @@ qdisc_run(dev); spin_unlock(&dev->queue_lock); } else { - netif_schedule(dev); + /* + * Dont re-kick the queue here, it will cause + * excessive scheduling of ksoftirqd due + * to retry. When the queue is released + * it will be completed anyway. + */ +// netif_schedule(dev); } } } + } static __inline__ int deliver_skb(struct sk_buff *skb, @@ -1775,12 +1793,13 @@ static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; int budget = netdev_max_backlog; local_irq_disable(); + queue = &__get_cpu_var(softnet_data); while (!list_empty(&queue->poll_list)) { struct net_device *dev; @@ -1789,6 +1808,10 @@ goto softnet_break; local_irq_enable(); + if (unlikely(cond_resched_all())) { + local_irq_disable(); + continue; + } dev = list_entry(queue->poll_list.next, struct net_device, poll_list); @@ -1814,8 +1837,10 @@ return; softnet_break: + preempt_disable(); __get_cpu_var(netdev_rx_stat).time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); + preempt_enable(); goto out; } Index: linux/net/sched/sch_generic.c =================================================================== --- linux.orig/net/sched/sch_generic.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/sched/sch_generic.c 2005-06-06 16:04:14.000000000 -0300 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,10 @@ * will be requeued. 
*/ if (!nolock) { +#ifdef CONFIG_PREEMPT_RT + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = _smp_processor_id(); +#else if (!spin_trylock(&dev->xmit_lock)) { collision: /* So, someone grabbed the driver. */ @@ -117,17 +122,19 @@ it by checking xmit owner and drop the packet when deadloop is detected. */ - if (dev->xmit_lock_owner == smp_processor_id()) { + if (dev->xmit_lock_owner == _smp_processor_id()) { kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + return -1; } __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } /* Remember that the driver is grabbed by us. */ - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = _smp_processor_id(); +#endif } { @@ -139,18 +146,34 @@ if (netdev_nit) dev_queue_xmit_nit(skb, dev); + WARN_ON_RT(irqs_disabled()); ret = dev->hard_start_xmit(skb, dev); +#ifdef CONFIG_PREEMPT_RT + if (irqs_disabled()) { + if (printk_ratelimit()) + print_symbol("network driver disabled interrupts: %s\n", (unsigned long)dev->hard_start_xmit); + local_irq_enable(); + } +#endif if (ret == NETDEV_TX_OK) { if (!nolock) { dev->xmit_lock_owner = -1; spin_unlock(&dev->xmit_lock); } spin_lock(&dev->queue_lock); +#if defined(CONFIG_PREEMPT_RT) && 0 + preempt_disable(); + __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); + goto requeue; +#else return -1; +#endif } if (ret == NETDEV_TX_LOCKED && nolock) { spin_lock(&dev->queue_lock); - goto collision; +// ugh, is this right. goto collision; + return -1; } } @@ -174,12 +197,16 @@ 3. 
device is buggy (ppp) */ +#ifndef CONFIG_PREEMPT_RT requeue: +#endif q->ops->requeue(skb, q); netif_schedule(dev); + return 1; } BUG_ON((int) q->q.qlen < 0); + return q->q.qlen; } Index: linux/net/sunrpc/sched.c =================================================================== --- linux.orig/net/sunrpc/sched.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/sunrpc/sched.c 2005-06-06 16:04:14.000000000 -0300 @@ -135,8 +135,6 @@ static void rpc_delete_timer(struct rpc_task *task) { - if (RPC_IS_QUEUED(task)) - return; if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { del_singleshot_timer_sync(&task->tk_timer); dprintk("RPC: %4d deleting timer\n", task->tk_pid); @@ -337,6 +335,8 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, rpc_action timer) { + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); /* * Protect the queue operations. */ @@ -566,7 +566,6 @@ BUG_ON(RPC_IS_QUEUED(task)); - restarted: while (1) { /* * Garbage collection of pending timers... @@ -594,6 +593,8 @@ unlock_kernel(); } + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); /* * Perform the next FSM step. * tk_action may be NULL when the task has been killed @@ -607,6 +608,7 @@ unlock_kernel(); } + restarted: /* * Lockless check for whether task is sleeping or not. */ @@ -925,6 +927,8 @@ void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) { + BUG_ON(test_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate) != 0 || + timer_pending(&task->tk_timer)); spin_lock_bh(&childq.lock); /* N.B. Is it possible for the child to have already finished? 
*/ __rpc_sleep_on(&childq, task, func, NULL); Index: linux/net/sunrpc/clnt.c =================================================================== --- linux.orig/net/sunrpc/clnt.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/sunrpc/clnt.c 2005-06-06 16:04:14.000000000 -0300 @@ -232,7 +232,8 @@ clnt->cl_oneshot = 0; clnt->cl_dead = 0; rpc_killall_tasks(clnt); - sleep_on_timeout(&destroy_wait, 1*HZ); + wait_event_timeout(destroy_wait, + !atomic_read(&clnt->cl_users), 1*HZ); } if (atomic_read(&clnt->cl_users) < 0) { Index: linux/net/ipv4/netfilter/ip_tables.c =================================================================== --- linux.orig/net/ipv4/netfilter/ip_tables.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/net/ipv4/netfilter/ip_tables.c 2005-06-06 16:04:14.000000000 -0300 @@ -111,7 +111,11 @@ static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) +/* + * Use atomic add because on PREEMPT_RT the same table might + * be used on two CPUs at once: + */ +#define ADD_COUNTER(c,b,p) do { atomic_add((b), (atomic_t *)(&(c).bcnt)); atomic_add((p), (atomic_t *)(&(c).pcnt)); } while(0) #ifdef CONFIG_SMP #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) @@ -290,8 +294,17 @@ read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + /* + * on a PREEMPT_RT kernel the task could schedule + * off and smp_processor_id() is not safe. So we take + * the current value of the CPU and use that table. We + * only update the counters while read-locking the table + * and don't change the rules so the possibility of the + * same table being used by two tasks at once is not a + * problem. 
+ */ table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + + TABLE_OFFSET(table->private, _smp_processor_id()); e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -299,7 +312,7 @@ if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", - smp_processor_id(), + _smp_processor_id(), table->name, &((struct ipt_entry *)table_base)->comefrom, ((struct ipt_entry *)table_base)->comefrom); Index: linux/sound/core/pcm_lib.c =================================================================== --- linux.orig/sound/core/pcm_lib.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/sound/core/pcm_lib.c 2005-06-06 16:04:14.000000000 -0300 @@ -133,6 +133,7 @@ snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); #ifdef CONFIG_SND_DEBUG if (substream->pstr->xrun_debug) { + user_trace_stop(); snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n", substream->pcm->card->number, substream->pcm->device, Index: linux/fs/xfs/linux-2.6/mutex.h =================================================================== --- linux.orig/fs/xfs/linux-2.6/mutex.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/linux-2.6/mutex.h 2005-06-06 16:04:14.000000000 -0300 @@ -42,7 +42,7 @@ * callers. 
*/ #define MUTEX_DEFAULT 0x0 -typedef struct semaphore mutex_t; +typedef struct compat_semaphore mutex_t; #define mutex_init(lock, type, name) sema_init(lock, 1) #define mutex_destroy(lock) sema_init(lock, -99) Index: linux/fs/xfs/linux-2.6/xfs_buf.c =================================================================== --- linux.orig/fs/xfs/linux-2.6/xfs_buf.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/linux-2.6/xfs_buf.c 2005-06-06 16:04:14.000000000 -0300 @@ -976,7 +976,7 @@ pagebuf_lock_value( xfs_buf_t *pb) { - return(atomic_read(&pb->pb_sema.count)); + return !sem_is_locked(&pb->pb_sema); } #endif Index: linux/fs/xfs/linux-2.6/xfs_buf.h =================================================================== --- linux.orig/fs/xfs/linux-2.6/xfs_buf.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/linux-2.6/xfs_buf.h 2005-06-06 16:04:14.000000000 -0300 @@ -138,7 +138,7 @@ #define PB_PAGES 2 typedef struct xfs_buf { - struct semaphore pb_sema; /* semaphore for lockables */ + struct compat_semaphore pb_sema; /* semaphore for lockables */ unsigned long pb_queuetime; /* time buffer was queued */ atomic_t pb_pin_count; /* pin count */ wait_queue_head_t pb_waiters; /* unpin waiters */ @@ -158,7 +158,7 @@ page_buf_iodone_t pb_iodone; /* I/O completion function */ page_buf_relse_t pb_relse; /* releasing function */ page_buf_bdstrat_t pb_strat; /* pre-write function */ - struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + struct compat_semaphore pb_iodonesema; /* Semaphore for I/O waiters */ void *pb_fspriv; void *pb_fspriv2; void *pb_fspriv3; Index: linux/fs/xfs/linux-2.6/mrlock.h =================================================================== --- linux.orig/fs/xfs/linux-2.6/mrlock.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/linux-2.6/mrlock.h 2005-06-06 16:04:14.000000000 -0300 @@ -37,12 +37,12 @@ enum { MR_NONE, MR_ACCESS, MR_UPDATE }; typedef struct { - struct rw_semaphore mr_lock; - int mr_writer; + struct compat_rw_semaphore 
mr_lock; + int mr_writer; } mrlock_t; #define mrinit(mrp, name) \ - ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) ) + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) #define mraccess(mrp) mraccessf(mrp, 0) Index: linux/fs/xfs/linux-2.6/sema.h =================================================================== --- linux.orig/fs/xfs/linux-2.6/sema.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/linux-2.6/sema.h 2005-06-06 16:04:14.000000000 -0300 @@ -41,7 +41,7 @@ * sema_t structure just maps to struct semaphore in Linux kernel. */ -typedef struct semaphore sema_t; +typedef struct compat_semaphore sema_t; #define init_sema(sp, val, c, d) sema_init(sp, val) #define initsema(sp, val) sema_init(sp, val) Index: linux/fs/xfs/xfs_mount.h =================================================================== --- linux.orig/fs/xfs/xfs_mount.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/xfs_mount.h 2005-06-06 16:04:14.000000000 -0300 @@ -340,7 +340,7 @@ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct compat_rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ sema_t m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ Index: linux/fs/xfs/quota/xfs_qm.h =================================================================== --- linux.orig/fs/xfs/quota/xfs_qm.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/quota/xfs_qm.h 2005-06-06 16:04:14.000000000 -0300 @@ -177,8 +177,8 @@ #define XFS_QM_BWARNLIMIT 5 #define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) -#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) +#define XFS_QM_LOCK(xqm) mutex_lock(&xqm##_lock, PINOD) 
+#define XFS_QM_UNLOCK(xqm) mutex_unlock(&xqm##_lock) #define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) Index: linux/fs/xfs/quota/xfs_quota_priv.h =================================================================== --- linux.orig/fs/xfs/quota/xfs_quota_priv.h 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/xfs/quota/xfs_quota_priv.h 2005-06-06 16:04:14.000000000 -0300 @@ -64,8 +64,8 @@ #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) -#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) +#define XQMLCK(h) mutex_lock(&((h)->qh_lock), PINOD) +#define XQMUNLCK(h) mutex_unlock(&((h)->qh_lock)) #ifdef DEBUG struct xfs_dqhash; static inline int XQMISLCKD(struct xfs_dqhash *h) Index: linux/fs/proc/array.c =================================================================== --- linux.orig/fs/proc/array.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/proc/array.c 2005-06-06 16:04:14.000000000 -0300 @@ -129,17 +129,19 @@ */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char * get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | Index: linux/fs/proc/proc_misc.c =================================================================== --- linux.orig/fs/proc/proc_misc.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/proc/proc_misc.c 2005-06-06 16:04:14.000000000 -0300 @@ -401,6 +401,41 @@ 
nr_running(), nr_iowait()); +#ifdef CONFIG_PREEMPT_RT + { + unsigned long nr_uninterruptible_cpu(int cpu); + extern int pi_walk, pi_null, pi_prio; + extern int rt_overload_schedule, + rt_overload_wakeup, rt_overload_pulled; + unsigned long rt_nr_running_cpu(int cpu); + extern atomic_t rt_overload; + + int i; + + seq_printf(p, "rt_overload_schedule: %d\n", + rt_overload_schedule); + seq_printf(p, "rt_overload_wakeup: %d\n", + rt_overload_wakeup); + seq_printf(p, "rt_overload_pulled: %d\n", + rt_overload_pulled); + seq_printf(p, "pi_null: %d\n", pi_null); + seq_printf(p, "pi_prio: %d\n", pi_prio); + seq_printf(p, "pi_walk: %d\n", pi_walk); + seq_printf(p, "nr_running(): %ld\n", + nr_running()); + seq_printf(p, "nr_uninterruptible(): %ld\n", + nr_uninterruptible()); + for_each_cpu(i) + seq_printf(p, "nr_uninterruptible(%d): %ld\n", + i, nr_uninterruptible_cpu(i)); + for_each_cpu(i) + seq_printf(p, "rt_nr_running(%d): %ld\n", + i, rt_nr_running_cpu(i)); + seq_printf(p, "rt_overload: %d\n", atomic_read(&rt_overload)); + + } +#endif + return 0; } @@ -517,6 +552,20 @@ return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_LATENCY_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -596,6 +645,9 @@ #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_LATENCY_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: linux/fs/proc/task_mmu.c 
=================================================================== --- linux.orig/fs/proc/task_mmu.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/proc/task_mmu.c 2005-06-06 16:04:14.000000000 -0300 @@ -184,8 +184,10 @@ map = NULL; if ((unsigned long)l < mm->map_count) { map = mm->mmap; - while (l-- && map) + while (l-- && map) { map = map->vm_next; + cond_resched(); + } goto out; } Index: linux/fs/nfsd/nfssvc.c =================================================================== --- linux.orig/fs/nfsd/nfssvc.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/nfsd/nfssvc.c 2005-06-06 16:04:14.000000000 -0300 @@ -285,6 +285,7 @@ /* Release the thread */ svc_exit_thread(rqstp); + unlock_kernel(); /* Release module */ module_put_and_exit(0); } Index: linux/fs/jbd/journal.c =================================================================== --- linux.orig/fs/jbd/journal.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/jbd/journal.c 2005-06-06 16:04:14.000000000 -0300 @@ -82,6 +82,14 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) +spinlock_t journal_bh_state_lock = SPIN_LOCK_UNLOCKED; +spinlock_t journal_bh_journal_lock = SPIN_LOCK_UNLOCKED; + +EXPORT_SYMBOL(journal_bh_state_lock); +EXPORT_SYMBOL(journal_bh_journal_lock); +#endif + /* * Helper function used to manage commit timeouts */ Index: linux/fs/pipe.c =================================================================== --- linux.orig/fs/pipe.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/pipe.c 2005-06-06 16:04:14.000000000 -0300 @@ -202,8 +202,14 @@ wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? 
+ */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -342,8 +348,14 @@ wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) inode_update_time(inode, 1); /* mtime and ctime */ +#endif return ret; } Index: linux/fs/nfs/inode.c =================================================================== --- linux.orig/fs/nfs/inode.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/nfs/inode.c 2005-06-06 16:04:14.000000000 -0300 @@ -118,7 +118,7 @@ int flags = sync ? FLUSH_WAIT : 0; int ret; - ret = nfs_commit_inode(inode, 0, 0, flags); + ret = nfs_commit_inode(inode, flags); if (ret < 0) return ret; return 0; Index: linux/fs/nfs/write.c =================================================================== --- linux.orig/fs/nfs/write.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/nfs/write.c 2005-06-06 16:04:14.000000000 -0300 @@ -352,7 +352,7 @@ if (err < 0) goto out; } - err = nfs_commit_inode(inode, 0, 0, wb_priority(wbc)); + err = nfs_commit_inode(inode, wb_priority(wbc)); if (err > 0) { wbc->nr_to_write -= err; err = 0; @@ -446,6 +446,8 @@ struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&nfsi->req_lock); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); nfs_list_add_request(req, &nfsi->dirty); nfsi->ndirty++; spin_unlock(&nfsi->req_lock); @@ -503,13 +505,12 @@ spin_lock(&nfsi->req_lock); next = idx_start; - while (radix_tree_gang_lookup(&nfsi->nfs_page_tree, (void **)&req, next, 1)) { + while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) { if (req->wb_index > idx_end) break; next = req->wb_index + 1; - if (!NFS_WBACK_BUSY(req)) - continue; + BUG_ON(!NFS_WBACK_BUSY(req)); atomic_inc(&req->wb_count); spin_unlock(&nfsi->req_lock); @@ -538,12 +539,15 @@ 
nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->dirty, dst, idx_start, npages); - nfsi->ndirty -= res; - sub_page_state(nr_dirty,res); - if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + int res = 0; + + if (nfsi->ndirty != 0) { + res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages); + nfsi->ndirty -= res; + sub_page_state(nr_dirty,res); + if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + } return res; } @@ -562,11 +566,14 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); - nfsi->ncommit -= res; - if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + int res = 0; + + if (nfsi->ncommit != 0) { + res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages); + nfsi->ncommit -= res; + if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + } return res; } #endif @@ -821,7 +828,7 @@ #else nfs_inode_remove_request(req); #endif - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } static inline int flush_task_priority(int how) @@ -952,7 +959,7 @@ nfs_writedata_free(data); } nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); return -ENOMEM; } @@ -1002,7 +1009,7 @@ struct nfs_page *req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1029,7 +1036,7 @@ req = nfs_list_entry(head->next); nfs_list_remove_request(req); 
nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return error; } @@ -1121,7 +1128,7 @@ nfs_inode_remove_request(req); #endif next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } } @@ -1210,36 +1217,24 @@ struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; - struct nfs_page *first, *last; + struct nfs_page *first; struct inode *inode; - loff_t start, end, len; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ list_splice_init(head, &data->pages); first = nfs_list_entry(data->pages.next); - last = nfs_list_entry(data->pages.prev); inode = first->wb_context->dentry->d_inode; - /* - * Determine the offset range of requests in the COMMIT call. - * We rely on the fact that data->pages is an ordered list... - */ - start = req_offset(first); - end = req_offset(last) + last->wb_bytes; - len = end - start; - /* If 'len' is not a 32-bit quantity, pass '0' in the COMMIT call */ - if (end >= i_size_read(inode) || len < 0 || len > (~((u32)0) >> 1)) - len = 0; - data->inode = inode; data->cred = first->wb_context->cred; data->args.fh = NFS_FH(data->inode); - data->args.offset = start; - data->args.count = len; - data->res.count = len; + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -1278,7 +1273,7 @@ req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_commit(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1324,7 +1319,7 @@ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); res++; } sub_page_state(nr_unstable,res); @@ -1350,8 +1345,7 @@ } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -int nfs_commit_inode(struct inode *inode, unsigned long idx_start, - 
unsigned int npages, int how) +int nfs_commit_inode(struct inode *inode, int how) { struct nfs_inode *nfsi = NFS_I(inode); LIST_HEAD(head); @@ -1359,15 +1353,13 @@ error = 0; spin_lock(&nfsi->req_lock); - res = nfs_scan_commit(inode, &head, idx_start, npages); + res = nfs_scan_commit(inode, &head, 0, 0); + spin_unlock(&nfsi->req_lock); if (res) { - res += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfsi->req_lock); error = nfs_commit_list(&head, how); - } else - spin_unlock(&nfsi->req_lock); - if (error < 0) - return error; + if (error < 0) + return error; + } return res; } #endif @@ -1389,7 +1381,7 @@ error = nfs_flush_inode(inode, idx_start, npages, how); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (error == 0) - error = nfs_commit_inode(inode, idx_start, npages, how); + error = nfs_commit_inode(inode, how); #endif } while (error > 0); return error; Index: linux/fs/nfs/read.c =================================================================== --- linux.orig/fs/nfs/read.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/nfs/read.c 2005-06-06 16:04:14.000000000 -0300 @@ -173,7 +173,6 @@ if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, &one_request); nfs_pagein_one(&one_request, inode); return 0; @@ -185,7 +184,6 @@ nfs_clear_request(req); nfs_release_request(req); - nfs_unlock_request(req); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -553,7 +551,6 @@ } if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, desc->head); return 0; } Index: linux/fs/nfs/pagelist.c =================================================================== --- linux.orig/fs/nfs/pagelist.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/nfs/pagelist.c 2005-06-06 16:04:14.000000000 -0300 @@ -112,6 +112,33 @@ } /** + * nfs_set_page_writeback_locked - Lock a 
request for writeback + * @req: + */ +int nfs_set_page_writeback_locked(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + if (!nfs_lock_request(req)) + return 0; + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + return 1; +} + +/** + * nfs_clear_page_writeback - Unlock request and wake up sleepers + */ +void nfs_clear_page_writeback(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + spin_lock(&nfsi->req_lock); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + spin_unlock(&nfsi->req_lock); + nfs_unlock_request(req); +} + +/** * nfs_clear_request - Free up all resources allocated to the request * @req: * @@ -151,36 +178,6 @@ } /** - * nfs_list_add_request - Insert a request into a sorted list - * @req: request - * @head: head of list into which to insert the request. - * - * Note that the wb_list is sorted by page index in order to facilitate - * coalescing of requests. - * We use an insertion sort that is optimized for the case of appended - * writes. - */ -void -nfs_list_add_request(struct nfs_page *req, struct list_head *head) -{ - struct list_head *pos; - -#ifdef NFS_PARANOIA - if (!list_empty(&req->wb_list)) { - printk(KERN_ERR "NFS: Add to list failed!\n"); - BUG(); - } -#endif - list_for_each_prev(pos, head) { - struct nfs_page *p = nfs_list_entry(pos); - if (p->wb_index < req->wb_index) - break; - } - list_add(&req->wb_list, pos); - req->wb_list_head = head; -} - -/** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. * @@ -243,6 +240,63 @@ return npages; } +#define NFS_SCAN_MAXENTRIES 16 +/** + * nfs_scan_lock_dirty - Scan the radix tree for dirty requests + * @nfsi: NFS inode + * @dst: Destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. 
+ * + * Moves elements from one of the inode request lists. + * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. + * You must be holding the inode's req_lock when calling this function + */ +int +nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst, + unsigned long idx_start, unsigned int npages) +{ + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + unsigned long idx_end; + int found, i; + int res; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + for (;;) { + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, NFS_SCAN_MAXENTRIES, + NFS_PAGE_TAG_DIRTY); + if (found <= 0) + break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + + idx_start = req->wb_index + 1; + + if (nfs_set_page_writeback_locked(req)) { + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + } + cond_resched_lock(&nfsi->req_lock); + } +out: + return res; +} + /** * nfs_scan_list - Scan a list for matching requests * @head: One of the NFS inode request lists @@ -257,10 +311,12 @@ * You must be holding the inode's req_lock when calling this function */ int -nfs_scan_list(struct list_head *head, struct list_head *dst, - unsigned long idx_start, unsigned int npages) +nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, + struct list_head *dst, unsigned long idx_start, + unsigned int npages) { - struct list_head *pos, *tmp; + LIST_HEAD(locked); + struct list_head *pos; struct nfs_page *req; unsigned long idx_end; int res; @@ -271,21 +327,22 @@ else idx_end = idx_start + npages - 1; - list_for_each_safe(pos, tmp, head) { + while (!list_empty(head)) { + pos = head->next; req = 
nfs_list_entry(pos); - if (req->wb_index < idx_start) - continue; - if (req->wb_index > idx_end) - break; - - if (!nfs_lock_request(req)) - continue; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - res++; + if (!nfs_set_page_writeback_locked(req)) { + list_del(pos); + list_add(&req->wb_list, &locked); + } else { + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + cond_resched_lock(&nfsi->req_lock); } + list_splice(&locked, head); return res; } Index: linux/fs/lockd/svc.c =================================================================== --- linux.orig/fs/lockd/svc.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/lockd/svc.c 2005-06-06 16:04:14.000000000 -0300 @@ -49,7 +49,7 @@ int nlmsvc_grace_period; unsigned long nlmsvc_timeout; -static DECLARE_MUTEX_LOCKED(lockd_start); +static DECLARE_WAIT_QUEUE_HEAD(lockd_start); static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); /* @@ -112,7 +112,7 @@ * Let our maker know we're running. */ nlmsvc_pid = current->pid; - up(&lockd_start); + wake_up(&lockd_start); daemonize("lockd"); @@ -261,8 +261,15 @@ "lockd_up: create thread failed, error=%d\n", error); goto destroy_and_out; } - down(&lockd_start); - + /* + * Wait for the lockd process to start, but since we're holding + * the lockd semaphore, we can't wait around forever ... + */ + wait_event_timeout(lockd_start, nlmsvc_pid != 0, HZ); + if (!nlmsvc_pid) { + printk(KERN_WARNING + "lockd_up: lockd failed to start\n"); + } /* * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. @@ -302,16 +309,12 @@ * Wait for the lockd process to exit, but since we're holding * the lockd semaphore, we can't wait around forever ... 
*/ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { + if (wait_event_interruptible_timeout(lockd_exit, + nlmsvc_pid == 0, HZ) <= 0) { printk(KERN_WARNING "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(&current->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); out: up(&nlmsvc_sema); } Index: linux/fs/fcntl.c =================================================================== --- linux.orig/fs/fcntl.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/fcntl.c 2005-06-06 16:04:14.000000000 -0300 @@ -442,7 +442,8 @@ break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + // we hold the tasklist lock already: + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -476,7 +477,7 @@ struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) Index: linux/fs/ext3/balloc.c =================================================================== --- linux.orig/fs/ext3/balloc.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/ext3/balloc.c 2005-06-06 16:04:14.000000000 -0300 @@ -749,24 +749,24 @@ * to find a free region that is of my size and has not * been reserved. * - * on succeed, it returns the reservation window to be appended to. - * failed, return NULL.
*/ -static struct ext3_reserve_window_node *find_next_reservable_window( +static int find_next_reservable_window( struct ext3_reserve_window_node *search_head, - unsigned long size, int *start_block, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, int start_block, int last_block) { struct rb_node *next; struct ext3_reserve_window_node *rsv, *prev; int cur; + int size = my_rsv->rsv_goal_size; /* TODO: make the start of the reservation window byte-aligned */ /* cur = *start_block & ~7;*/ - cur = *start_block; + cur = start_block; rsv = search_head; if (!rsv) - return NULL; + return -1; while (1) { if (cur <= rsv->rsv_end) @@ -782,7 +782,7 @@ * space with expected-size (or more)... */ if (cur > last_block) - return NULL; /* fail */ + return -1; /* fail */ prev = rsv; next = rb_next(&rsv->rsv_node); @@ -813,8 +813,26 @@ * return the reservation window that we could append to. * succeed. */ - *start_block = cur; - return prev; + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* let's book the whole available window for now + * we will check the + * disk bitmap later and then, if there are free blocks + * then we adjust the window size if it's + * larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window.
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; } /** @@ -852,6 +870,7 @@ * @sb: the super block * @group: the group we are trying to allocate in * @bitmap_bh: the block group block bitmap + * */ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, int goal, struct super_block *sb, @@ -860,10 +879,10 @@ struct ext3_reserve_window_node *search_head; int group_first_block, group_end_block, start_block; int first_free_block; - int reservable_space_start; - struct ext3_reserve_window_node *prev_rsv; struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + group * EXT3_BLOCKS_PER_GROUP(sb); @@ -875,6 +894,7 @@ start_block = goal + group_first_block; size = my_rsv->rsv_goal_size; + if (!rsv_is_empty(&my_rsv->rsv_window)) { /* * if the old reservation is cross group boundary @@ -908,6 +928,8 @@ my_rsv->rsv_goal_size= size; } } + + spin_lock(rsv_lock); /* * shift the search start to the window near the goal block */ @@ -921,11 +943,16 @@ * need to check the bitmap after we found a reservable window. */ retry: - prev_rsv = find_next_reservable_window(search_head, size, - &start_block, group_end_block); - if (prev_rsv == NULL) - goto failed; - reservable_space_start = start_block; + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + /* * On success, find_next_reservable_window() returns the * reservation window where there is a reservable space after it. @@ -937,8 +964,9 @@ * block. Search start from the start block of the reservable space * we just found. 
*/ + spin_unlock(rsv_lock); first_free_block = bitmap_search_next_usable_block( - reservable_space_start - group_first_block, + my_rsv->rsv_start - group_first_block, bitmap_bh, group_end_block - group_first_block + 1); if (first_free_block < 0) { @@ -946,54 +974,30 @@ * no free block left on the bitmap, no point * to reserve the space. return failed. */ - goto failed; + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ } + start_block = first_free_block + group_first_block; /* * check if the first free block is within the - * free space we just found + * free space we just reserved */ - if ((start_block >= reservable_space_start) && - (start_block < reservable_space_start + size)) - goto found_rsv_window; + if ((start_block >= my_rsv->rsv_start) && + (start_block < my_rsv->rsv_end)) + return 0; /* succeed */ /* * if the first free bit we found is out of the reservable space - * this means there is no free block on the reservable space - * we should continue search for next reservable space, + * continue search for next reservable space, * start from where the free block is, * we also shift the list head to where we stopped last time */ - search_head = prev_rsv; + search_head = my_rsv; + spin_lock(rsv_lock); goto retry; - -found_rsv_window: - /* - * great! the reservable space contains some free blocks. - * if the search returns that we should add the new - * window just next to where the old window, we don't - * need to remove the old window first then add it to the - * same place, just update the new start and new end. 
- */ - if (my_rsv != prev_rsv) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - } - my_rsv->rsv_start = reservable_space_start; - my_rsv->rsv_end = my_rsv->rsv_start + size - 1; - my_rsv->rsv_alloc_hit = 0; - if (my_rsv != prev_rsv) { - ext3_rsv_window_add(sb, my_rsv); - } - return 0; /* succeed */ -failed: - /* - * failed to find a new reservation window in the current - * group, remove the current(stale) reservation window - * if there is any - */ - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - return -1; /* failed */ } /* @@ -1023,7 +1027,6 @@ int goal, struct ext3_reserve_window_node * my_rsv, int *errp) { - spinlock_t *rsv_lock; unsigned long group_first_block; int ret = 0; int fatal; @@ -1052,7 +1055,6 @@ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); goto out; } - rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; /* * goal is a group relative block number (if there is a goal) * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) @@ -1078,30 +1080,21 @@ * then we could go to allocate from the reservation window directly. 
*/ while (1) { - struct ext3_reserve_window rsv_copy; - - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - - if (rsv_is_empty(&rsv_copy) || (ret < 0) || - !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { - spin_lock(rsv_lock); + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { ret = alloc_new_reservation(my_rsv, goal, sb, group, bitmap_bh); - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - spin_unlock(rsv_lock); if (ret < 0) break; /* failed */ - if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) + if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) goal = -1; } - if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) - || (rsv_copy._rsv_end < group_first_block)) + if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) + || (my_rsv->rsv_end < group_first_block)) BUG(); ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, - &rsv_copy); + &my_rsv->rsv_window); if (ret >= 0) { my_rsv->rsv_alloc_hit++; break; /* succeed */ Index: linux/fs/ext3/file.c =================================================================== --- linux.orig/fs/ext3/file.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/ext3/file.c 2005-06-06 16:04:14.000000000 -0300 @@ -36,7 +36,11 @@ /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) + { + down(&EXT3_I(inode)->truncate_sem); ext3_discard_reservation(inode); + up(&EXT3_I(inode)->truncate_sem); + } if (is_dx(inode) && filp->private_data) ext3_htree_free_dir_info(filp->private_data); Index: linux/fs/exec.c =================================================================== --- linux.orig/fs/exec.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/exec.c 2005-06-06 16:04:14.000000000 -0300 @@ -48,6 +48,7 @@ #include #include #include 
+#include #include #include @@ -568,11 +569,16 @@ } } task_lock(tsk); + + local_irq_disable(); // FIXME active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -667,7 +673,7 @@ * of the time. */ while (leader->exit_state != EXIT_ZOMBIE) - yield(); + msleep(1); spin_lock(&leader->proc_lock); spin_lock(&current->proc_lock); @@ -1416,9 +1422,6 @@ mm->core_waiters++; /* let other threads block */ mm->core_startup_done = &startup_done; - /* give other threads a chance to run: */ - yield(); - zap_threads(mm); if (--mm->core_waiters) { up_write(&mm->mmap_sem); Index: linux/fs/aio.c =================================================================== --- linux.orig/fs/aio.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/aio.c 2005-06-06 16:04:14.000000000 -0300 @@ -564,9 +564,11 @@ tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); + local_irq_disable(); // FIXME + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); Index: linux/fs/dcache.c =================================================================== --- linux.orig/fs/dcache.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/fs/dcache.c 2005-06-06 16:04:14.000000000 -0300 @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); -static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +static DECLARE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); Index: linux/mm/slab.c =================================================================== --- linux.orig/mm/slab.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/slab.c 2005-06-06 16:04:14.000000000 -0300 @@ -578,9 +578,9 @@ return (void**)(ac+1); } -static inline struct array_cache
*ac_data(kmem_cache_t *cachep) +static inline struct array_cache *ac_data(kmem_cache_t *cachep, int cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[cpu]; } static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) @@ -833,21 +833,22 @@ /* 4) Replace the bootstrap head arrays */ { void * ptr; + int cpu = smp_processor_id(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); - cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + local_irq_disable_nort(); + BUG_ON(ac_data(&cache_cache, cpu) != &initarray_cache.cache); + memcpy(ptr, ac_data(&cache_cache, cpu), sizeof(struct arraycache_init)); + cache_cache.array[cpu] = ptr; + local_irq_enable_nort(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), + local_irq_disable_nort(); + BUG_ON(ac_data(malloc_sizes[0].cs_cachep, cpu) != &initarray_generic.cache); + memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep, cpu), sizeof(struct arraycache_init)); - malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; - local_irq_enable(); + malloc_sizes[0].cs_cachep->array[cpu] = ptr; + local_irq_enable_nort(); } /* 5) resize the head arrays to their final sizes */ @@ -972,7 +973,7 @@ *addr++=0x12345678; *addr++=caller; - *addr++=smp_processor_id(); + *addr++=_smp_processor_id(); size -= 3*sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1203,6 +1204,7 @@ { size_t left_over, slab_size, ralign; kmem_cache_t *cachep = NULL; + int cpu = _smp_processor_id(); /* * Sanity checks... these are all serious usage bugs. @@ -1440,16 +1442,16 @@ * the cache that's used by kmalloc(24), otherwise * the creation of further caches will BUG(). 
*/ - cachep->array[smp_processor_id()] = &initarray_generic.cache; + cachep->array[cpu] = &initarray_generic.cache; g_cpucache_up = PARTIAL; } else { - cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); + cachep->array[cpu] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); } - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; + BUG_ON(!ac_data(cachep, cpu)); + ac_data(cachep, cpu)->avail = 0; + ac_data(cachep, cpu)->limit = BOOT_CPUCACHE_ENTRIES; + ac_data(cachep, cpu)->batchcount = 1; + ac_data(cachep, cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount @@ -1503,7 +1505,9 @@ #if DEBUG static void check_irq_off(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) @@ -1545,22 +1549,39 @@ static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac, int force); -static void do_drain(void *arg) +static void do_drain_cpu(kmem_cache_t *cachep, int cpu) { - kmem_cache_t *cachep = (kmem_cache_t*)arg; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + spin_lock(&cachep->spinlock); + ac = ac_data(cachep, cpu); free_block(cachep, &ac_entry(ac)[0], ac->avail); - spin_unlock(&cachep->spinlock); ac->avail = 0; + spin_unlock(&cachep->spinlock); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * Executes in an IRQ context: + */ +static void do_drain(void *arg) +{ + do_drain_cpu((kmem_cache_t*)arg, smp_processor_id()); } +#endif static void drain_cpu_caches(kmem_cache_t *cachep) { +#ifndef CONFIG_PREEMPT_RT smp_call_function_all_cpus(do_drain, cachep); +#else + int cpu; + + for_each_online_cpu(cpu) + do_drain_cpu(cachep, cpu); +#endif check_irq_on(); spin_lock_irq(&cachep->spinlock); if (cachep->lists.shared) @@ -1827,7 +1848,7 @@ spin_unlock(&cachep->spinlock); 
if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_irq_enable_nort(); /* * The test for missing atomic flag is performed here, rather than @@ -1851,7 +1872,7 @@ cache_init_objs(cachep, slabp, ctor_flags); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); check_irq_off(); spin_lock(&cachep->spinlock); @@ -1865,7 +1886,7 @@ kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); return 0; } @@ -1991,14 +2012,14 @@ #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags) +static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags, int cpu) { int batchcount; struct kmem_list3 *l3; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + ac = ac_data(cachep, cpu); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2011,7 +2032,7 @@ l3 = list3_data(cachep); BUG_ON(ac->avail > 0); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (l3->shared) { struct array_cache *shared_array = l3->shared; if (shared_array->avail) { @@ -2069,14 +2090,17 @@ must_grow: l3->free_objects -= ac->avail; alloc_done: - spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); if (unlikely(!ac->avail)) { int x; + spin_unlock_rt(&cachep->spinlock); x = cache_grow(cachep, flags, -1); - + + spin_lock_rt(&cachep->spinlock); // cache_grow can reenable interrupts, then ac could change. - ac = ac_data(cachep); + cpu = smp_processor_id_rt(cpu); + ac = ac_data(cachep, cpu); if (!x && ac->avail == 0) // no objects in sight? 
abort return NULL; @@ -2145,23 +2169,26 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) { + int cpu = _smp_processor_id(); unsigned long save_flags; void* objp; struct array_cache *ac; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - ac = ac_data(cachep); + local_irq_save_nort(save_flags); + spin_lock_rt(&cachep->spinlock); + ac = ac_data(cachep, cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac_entry(ac)[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, cpu); } - local_irq_restore(save_flags); + spin_unlock_rt(&cachep->spinlock); + local_irq_restore_nort(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); return objp; } @@ -2231,7 +2258,7 @@ BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (cachep->lists.shared) { struct array_cache *shared_array = cachep->lists.shared; int max = shared_array->limit-shared_array->avail; @@ -2266,7 +2293,7 @@ STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); ac->avail -= batchcount; memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], sizeof(void*)*ac->avail); @@ -2281,20 +2308,22 @@ */ static inline void __cache_free(kmem_cache_t *cachep, void *objp) { - struct array_cache *ac = ac_data(cachep); + int cpu = _smp_processor_id(); + struct array_cache *ac = ac_data(cachep, cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + spin_lock_rt(&cachep->spinlock); if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); ac_entry(ac)[ac->avail++] = objp; - return; } else { STATS_INC_FREEMISS(cachep); cache_flusharray(cachep, ac); ac_entry(ac)[ac->avail++] = objp; } + 
spin_unlock_rt(&cachep->spinlock); } /** @@ -2395,12 +2424,12 @@ } spin_unlock_irq(&cachep->spinlock); - local_irq_disable(); + local_irq_disable_nort(); if (!cache_grow(cachep, flags, nodeid)) { - local_irq_enable(); + local_irq_enable_nort(); return NULL; } - local_irq_enable(); + local_irq_enable_nort(); } got_slabp: /* found one: allocate object */ @@ -2542,9 +2571,9 @@ { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); __cache_free(cachep, objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kmem_cache_free); @@ -2568,6 +2597,21 @@ } EXPORT_SYMBOL(kcalloc); +#ifdef CONFIG_RT_DEADLOCK_DETECT +static size_t cache_size(kmem_cache_t *c) +{ + struct cache_sizes *csizep = malloc_sizes; + + for ( ; csizep->cs_size; csizep++) { + if (csizep->cs_cachep == c) + return csizep->cs_size; + if (csizep->cs_dmacachep == c) + return csizep->cs_size; + } + return 0; +} +#endif + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. 
@@ -2582,11 +2626,16 @@ if (unlikely(!objp)) return; - local_irq_save(flags); + local_irq_save_nort(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (check_no_locks_freed(objp, objp+cache_size(c))) + printk("slab %s[%p] (%d), obj: %p\n", + c->name, c, c->objsize, objp); +#endif __cache_free(c, (void*)objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kfree); @@ -2625,13 +2674,17 @@ struct array_cache *new[NR_CPUS]; }; +/* + * Executes in IRQ context: + */ static void do_ccupdate_local(void *info) { struct ccupdate_struct *new = (struct ccupdate_struct *)info; struct array_cache *old; +// WARN_ON(!in_interrupt()); check_irq_off(); - old = ac_data(new->cachep); + old = ac_data(new->cachep, smp_processor_id()); new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; @@ -2740,6 +2793,10 @@ if (limit > 32) limit = 32; #endif +#ifdef CONFIG_PREEMPT + if (limit > 16) + limit = 16; +#endif err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", @@ -2779,11 +2836,12 @@ */ static void cache_reap(void *unused) { + int cpu = _smp_processor_id(); struct list_head *walk; if (down_trylock(&cache_chain_sem)) { /* Give up. Setup the next iteration. 
*/ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + cpu); return; } @@ -2802,7 +2860,7 @@ spin_lock_irq(&searchp->spinlock); - drain_array_locked(searchp, ac_data(searchp), 0); + drain_array_locked(searchp, ac_data(searchp, cpu), 0); if(time_after(searchp->lists.next_reap, jiffies)) goto next_unlock; @@ -2846,7 +2904,7 @@ check_irq_on(); up(&cache_chain_sem); /* Setup the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC+cpu); } #ifdef CONFIG_PROC_FS @@ -3067,10 +3125,10 @@ unsigned int size = 0; if (likely(objp != NULL)) { - local_irq_save(flags); + local_irq_save_nort(flags); c = GET_PAGE_CACHE(virt_to_page(objp)); size = kmem_cache_size(c); - local_irq_restore(flags); + local_irq_restore_nort(flags); } return size; Index: linux/mm/highmem.c =================================================================== --- linux.orig/mm/highmem.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/highmem.c 2005-06-06 16:04:14.000000000 -0300 @@ -242,11 +242,11 @@ unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ Index: linux/mm/page_alloc.c =================================================================== --- linux.orig/mm/page_alloc.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/page_alloc.c 2005-06-06 16:04:14.000000000 -0300 @@ -369,6 +369,9 @@ int i; arch_free_page(page, order); + if (!PageHighMem(page)) + check_no_locks_freed(page_address(page), + page_address(page+(1<lock, flags); return allocated; } +#endif #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 
static void __drain_pages(unsigned int cpu) @@ -598,6 +603,7 @@ #endif } +#if !defined(CONFIG_PREEMPT_RT) /* * Free a 0-order page */ @@ -624,15 +630,32 @@ local_irq_restore(flags); put_cpu(); } +#endif +/* + * On PREEMPT_RT we use a simple solution for the time being, + * per-CPU allocation is disabled. + */ void fastcall free_hot_page(struct page *page) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 0); +#endif } void fastcall free_cold_page(struct page *page) { +#ifdef CONFIG_PREEMPT_RT + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 1); +#endif } static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) @@ -654,6 +677,7 @@ { unsigned long flags; struct page *page = NULL; +#if !defined(CONFIG_PREEMPT_RT) int cold = !!(gfp_flags & __GFP_COLD); if (order == 0) { @@ -672,6 +696,7 @@ local_irq_restore(flags); put_cpu(); } +#endif if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); @@ -950,8 +975,15 @@ { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(pvec->pages[i])) + pvec->pages[i]->mapping = NULL; + __free_pages_ok(pvec->pages[i], 0); +#else free_hot_cold_page(pvec->pages[i], pvec->cold); +#endif + } } fastcall void __free_pages(struct page *page, unsigned int order) Index: linux/mm/swap.c =================================================================== --- linux.orig/mm/swap.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/swap.c 2005-06-06 16:04:14.000000000 -0300 @@ -136,39 +136,45 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct 
pagevec, lru_add_active_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); } void fastcall lru_cache_add_active(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } void lru_add_drain(void) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec; + pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); + + pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } /* Index: linux/mm/mmap.c =================================================================== --- linux.orig/mm/mmap.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/mmap.c 2005-06-06 16:04:14.000000000 -0300 @@ -1794,10 +1794,16 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_KERNEL - if (unlikely(down_read_trylock(&mm->mmap_sem))) { +# ifdef CONFIG_PREEMPT_RT + if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) { WARN_ON(1); - up_read(&mm->mmap_sem); } +# else + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +# endif #endif } Index: linux/mm/memory.c 
=================================================================== --- linux.orig/mm/memory.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/mm/memory.c 2005-06-06 16:04:14.000000000 -0300 @@ -116,7 +116,7 @@ pmd_clear(pmd); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; + tlb_mm(tlb)->nr_ptes--; } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -241,7 +241,7 @@ return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb_mm(*tlb), addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -250,7 +250,7 @@ } while (pgd++, addr = next, addr != end); if (!tlb_is_full_mm(*tlb)) - flush_tlb_pgtables((*tlb)->mm, start, end); + flush_tlb_pgtables(tlb_mm(*tlb), start, end); } void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, @@ -551,22 +551,22 @@ page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear(tlb->mm, addr, pte); + ptent = ptep_get_and_clear(tlb_mm(tlb), addr, pte); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(tlb->mm, addr, pte, + set_pte_at(tlb_mm(tlb), addr, pte, pgoff_to_pte(page->index)); if (pte_dirty(ptent)) set_page_dirty(page); if (PageAnon(page)) - dec_mm_counter(tlb->mm, anon_rss); + dec_mm_counter(tlb_mm(tlb), anon_rss); else if (pte_young(ptent)) mark_page_accessed(page); - tlb->freed++; + tlb_free(tlb); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -579,7 +579,7 @@ continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear(tlb->mm, addr, pte); + pte_clear(tlb_mm(tlb), addr, pte); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); } Index: linux/kernel/stop_machine.c =================================================================== --- linux.orig/kernel/stop_machine.c 2005-06-06 
10:18:16.000000000 -0300 +++ linux/kernel/stop_machine.c 2005-06-06 16:04:14.000000000 -0300 @@ -56,7 +56,7 @@ /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + __yield(); else cpu_relax(); } @@ -110,7 +110,7 @@ /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + __yield(); /* If some failed, kill them all. */ if (ret < 0) { Index: linux/kernel/rt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux/kernel/rt.c 2005-06-06 16:04:14.000000000 -0300 @@ -0,0 +1,1883 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * These flags are used for allowing of stealing of ownerships. + */ +#define RT_PENDOWNER 1 /* pending owner on a lock */ + +#define TASK_PENDING(task) \ + ((task)->rt_flags & RT_PENDOWNER) + +/* + * This flag is good for debugging the PI code - it makes all tasks + * in the system fall under PI handling. Normally only SCHED_FIFO/RR + * tasks are PI-handled: + */ +//#define ALL_TASKS_PI + +/* + * We need a global lock for priority inheritance handling. 
+ * This is only for the slow path, but still, we might want + * to optimize it later to be more scalable. + */ +static __cacheline_aligned_in_smp raw_spinlock_t pi_lock = + RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_RT_DEADLOCK_DETECT +/* + * We need a global lock when we walk through the multi-process + * lock tree... + */ +static raw_spinlock_t trace_lock = RAW_SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(held_locks); + +/* + * deadlock detection flag. We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +static int trace_on = 1; + +void deadlock_trace_off(void) +{ + trace_on = 0; +} + +#define trace_lock_irq(lock) \ + do { \ + local_irq_disable(); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + } while (0) + +#define trace_unlock_irq(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + local_irq_enable(); \ + preempt_check_resched(); \ + } while (0) + +#define trace_lock_irqsave(lock, flags) \ + do { \ + local_irq_save(flags); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock_irqrestore(lock, flags) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) + +#define TRACE_OFF() \ +do { \ + if (trace_on) { \ + trace_on = 0; \ + console_verbose(); \ + spin_unlock(&trace_lock); \ + } \ +} while (0) + +#define TRACE_BUG() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +#define TRACE_WARN_ON(c) \ +do { \ + if (c) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +#else +# define trace_lock_irq(lock) local_irq_disable() +# define trace_lock_irqsave(lock, flags) local_irq_save(flags) +# define trace_unlock(lock) do { } while (0) + +# define trace_unlock_irq(lock) \ + do { local_irq_enable(); preempt_check_resched(); } while (0) + +# define 
trace_unlock_irqrestore(lock, flags) \ + do { local_irq_restore(flags); preempt_check_resched(); } while (0) + +# define TRACE_BUG() do { } while (0) +# define TRACE_WARN_ON(c) do { } while (0) +# define TRACE_OFF() do { } while (0) +#endif /* CONFIG_RT_DEADLOCK_DETECT */ + +#define TRACE_BUG_ON(c) do { if (c) TRACE_BUG(); } while (0) + +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + spin_lock_init(&pi_lock); +#ifdef CONFIG_RT_DEADLOCK_DETECT + spin_lock_init(&trace_lock); +#endif +} + +#ifdef CONFIG_RT_DEADLOCK_DETECT + +static void printk_task(struct task_struct *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_task_short(struct task_struct *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && lock->owner) { + printk(".. held by: "); + printk_task(lock->owner); + printk("\n"); + } + if (lock->owner) { + printk("... 
acquired at: "); + print_symbol("%s\n", lock->acquire_eip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->task:\n"); + printk_task(w->task); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| blocked at: "); + print_symbol("%s\n", w->eip); + printk("-------------------------\n"); +} + +static void show_task_locks(struct task_struct *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_RUNNING_MUTEX: printk("M"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->blocked_on) { + struct rt_mutex *lock = p->blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +static void show_held_locks(struct task_struct *filter) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + struct task_struct *p; + unsigned long flags; + int count = 0; + + printk("\n"); + if (filter) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(filter); + printk("):\n"); + printk("------------------------------\n"); + } else { + printk("---------------------------\n"); + printk("| showing all locks held: |\n"); + printk("---------------------------\n"); + } + + /* + * Play safe and acquire the global trace lock. 
We + * cannot printk with that lock held so we iterate + * very carefully: + */ +next: + trace_lock_irqsave(&trace_lock, flags); + list_for_each(curr, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (filter && (p != filter)) + continue; + count++; + cursor = curr->next; + trace_unlock_irqrestore(&trace_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, filter ? 0 : 1); + goto next; + } + trace_unlock_irqrestore(&trace_lock, flags); +} + +void show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + printk("\nshowing all tasks:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... 
"); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + show_held_locks(NULL); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +static int check_deadlock(struct rt_mutex *lock, int depth, + unsigned long eip) +{ + struct rt_mutex *lockblk; + struct task_struct *task; + + if (!trace_on) + return 0; + /* + * Special-case: the BKL self-releases at schedule() + * time so it can never deadlock: + */ + if (lock == &kernel_sem.lock) + return 0; + task = lock->owner; + if (!task) + return 0; + lockblk = NULL; + if (task->blocked_on) + lockblk = task->blocked_on->lock; + if (current == task) { + TRACE_OFF(); + if (depth) + return 1; + printk("\n==========================================\n"); + printk( "[ BUG: lock recursion deadlock detected! |\n"); + printk( "------------------------------------------\n"); + printk("already locked: "); + printk_lock(lock, 1); + show_held_locks(task); + printk("\n-{current task's backtrace}----------------->\n"); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. ]\n\n"); + local_irq_disable(); + return 0; + } + /* + * Skip the BKL: + */ + if (lockblk == &kernel_sem.lock) + return 0; + /* + * Ugh, something corrupted the lock data structure? + */ + if (depth > 30) { + TRACE_OFF(); + printk("\n===========================================\n"); + printk( "[ BUG: infinite lock dependency detected!? |\n"); + printk( "-------------------------------------------\n"); + goto print_it; + } + if (lockblk && check_deadlock(lockblk, depth+1, eip)) { + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! 
]\n"); + printk( "--------------------------------------------\n"); +print_it: + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", eip); + + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task->pid); + printk_lock(lockblk, 1); + + show_held_locks(current); + show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. ]\n\n"); + local_irq_disable(); + return 0; + } + return 0; +} + +void check_no_held_locks(struct task_struct *task) +{ + struct list_head *curr, *next, *cursor = NULL; + struct plist *curr1; + struct rt_mutex *lock; + struct rt_mutex_waiter *w; + struct task_struct *p; + unsigned long flags; + + if (!trace_on) + return; +restart: + trace_lock_irqsave(&trace_lock, flags); + list_for_each_safe(curr, next, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (p != task) + continue; + cursor = next; + list_del_init(curr); + trace_unlock_irqrestore(&trace_lock, flags); + + if (lock == &kernel_sem.lock) { + printk("BUG: %s/%d, BKL held at task exit time!\n", + current->comm, current->pid); + printk("BKL acquired at: "); + print_symbol("%s\n", + (unsigned long) current->last_kernel_lock); + } else + printk("BUG: %s/%d, lock held at task exit time!\n", + current->comm, current->pid); + printk_lock(lock, 1); + if (lock->owner != task) + printk("exiting task is not even the owner??\n"); + goto restart; + } + spin_lock(&pi_lock); + plist_for_each(curr1, &task->pi_waiters) { + w = plist_entry(curr1, 
struct rt_mutex_waiter, pi_list); + TRACE_OFF(); + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + printk("hm, PI interest held at exit time? Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); +} + +int check_no_locks_freed(const void *from, const void *to) +{ + struct list_head *curr, *next, *cursor = NULL; + struct rt_mutex *lock; + unsigned long flags; + void *lock_addr; + int err = 0; + + if (!trace_on) + return err; +restart: + trace_lock_irqsave(&trace_lock, flags); + list_for_each_safe(curr, next, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + cursor = next; + list_del_init(curr); + TRACE_OFF(); + trace_unlock_irqrestore(&trace_lock, flags); + err = 1; + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (lock->owner != current) + printk("freeing task is not even the owner??\n"); + goto restart; + } + trace_unlock_irqrestore(&trace_lock, flags); + + return err; +} + +#endif + +#if defined(ALL_TASKS_PI) && defined(CONFIG_RT_DEADLOCK_DETECT) + +static void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct plist *curr1; + + TRACE_WARN_ON(plist_empty(&waiter->pi_list)); + TRACE_WARN_ON(lock->owner); + + plist_for_each(curr1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + if (w == waiter) + return; + } + /* waiter is not on the old owner's PI list: */ + TRACE_WARN_ON(1); +} + +static void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct plist *curr1; + + plist_for_each(curr1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, 
pi_list); + if (w->lock == lock) { + TRACE_OFF(); + printk("hm, PI interest but no waiter? Old owner:\n"); + printk_waiter(w); + printk("\n"); + TRACE_WARN_ON(1); + return; + } + } +} + +#else + +static inline void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ +} + +static inline void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ +} + +#endif + +/* + * Move PI waiters of this lock to the new owner: + */ +static void +change_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner) +{ + struct plist *next1, *curr1; + struct rt_mutex_waiter *w; + int requeued = 0, sum = 0; + + if (old_owner == new_owner) + return; + + plist_for_each_safe(curr1, next1, &old_owner->pi_waiters) { + w = plist_entry(curr1, struct rt_mutex_waiter, pi_list); + if (w->lock == lock) { + plist_del(&w->pi_list, &old_owner->pi_waiters); + plist_init(&w->pi_list, w->task->prio); + plist_add(&w->pi_list, &new_owner->pi_waiters); + requeued++; + } + sum++; + } + trace_special(sum, requeued, 0); +} + +int pi_walk, pi_null, pi_prio; + +static void pi_setprio(struct rt_mutex *lock, struct task_struct *p, int prio) +{ + if (unlikely(!p->pid)) { + pi_null++; + return; + } + +#ifdef CONFIG_RT_DEADLOCK_DETECT + pi_prio++; + if (p->policy != SCHED_NORMAL && prio > mutex_getprio(p)) { + TRACE_OFF(); + + printk("huh? 
(%d->%d??)\n", p->prio, prio); + printk("owner:\n"); + printk_task(p); + printk("\ncurrent:\n"); + printk_task(current); + printk("\nlock:\n"); + printk_lock(lock, 1); + dump_stack(); + local_irq_disable(); + } +#endif + /* + * If the task is blocked on some other task then boost that + * other task (or tasks) too: + */ + for (;;) { + struct rt_mutex_waiter *w = p->blocked_on; + int was_rt = rt_task(p); + + mutex_setprio(p, prio); + if (!w) + break; + /* + * If the task is blocked on a lock, and we just made + * it RT, then register the task in the PI list and + * requeue it to the wait list: + */ + lock = w->lock; + TRACE_BUG_ON(!lock); + TRACE_BUG_ON(!lock->owner); + if (rt_task(p) && plist_empty(&w->pi_list)) { + TRACE_BUG_ON(was_rt); + plist_init(&w->pi_list, prio); + plist_add(&w->pi_list, &lock->owner->pi_waiters); + + plist_del(&w->list, &lock->wait_list); + plist_init(&w->list, prio); + plist_add(&w->list, &lock->wait_list); + + } + /* + * If the task is blocked on a lock, and we just restored + * it from RT to non-RT then unregister the task from + * the PI list and requeue it to the wait list. + * + * (TODO: this can be unfair to SCHED_NORMAL tasks if they + * get PI handled.) 
+ */ + if (!rt_task(p) && !plist_empty(&w->pi_list)) { + TRACE_BUG_ON(!was_rt); + plist_del(&w->pi_list, &lock->owner->pi_waiters); + plist_del(&w->list, &lock->wait_list); + plist_init(&w->list, prio); + plist_add(&w->list, &lock->wait_list); + + } + + pi_walk++; + + p = lock->owner; + TRACE_BUG_ON(!p); + /* + * If the dependee is already higher-prio then + * no need to boost it, and all further tasks down + * the dependency chain are already boosted: + */ + if (p->prio <= prio) + break; + } +} + +static void +task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct task_struct *task, + struct rt_mutex *lock, unsigned long eip) +{ +#ifdef CONFIG_RT_DEADLOCK_DETECT + check_deadlock(lock, 0, eip); + /* mark the current thread as blocked on the lock */ + waiter->eip = eip; +#endif + task->blocked_on = waiter; + waiter->lock = lock; + waiter->task = task; + plist_init(&waiter->pi_list, task->prio); + /* + * Add SCHED_NORMAL tasks to the end of the waitqueue (FIFO): + */ +#ifndef ALL_TASKS_PI + if (!rt_task(task)) { + plist_add(&waiter->list, &lock->wait_list); + return; + } +#endif + spin_lock(&pi_lock); + plist_add(&waiter->pi_list, &lock->owner->pi_waiters); + /* + * Add RT tasks to the head: + */ + plist_add(&waiter->list, &lock->wait_list); + /* + * If the waiter has higher priority than the owner + * then temporarily boost the owner: + */ + if (task->prio < lock->owner->prio) + pi_setprio(lock, lock->owner, task->prio); + spin_unlock(&pi_lock); +} + +/* + * initialise the lock: + */ +static void __init_rt_mutex(struct rt_mutex *lock, int save_state, + char *name, char *file, int line) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + preempt_disable(); + plist_init(&lock->wait_list, MAX_PRIO); + preempt_enable(); +#ifdef CONFIG_RT_DEADLOCK_DETECT + lock->save_state = save_state; + INIT_LIST_HEAD(&lock->held_list); + lock->name = name; + lock->file = file; + lock->line = line; +#endif +} + +void fastcall __init_rwsem(struct rw_semaphore *rwsem, int 
save_state, + char *name, char *file, int line) +{ + __init_rt_mutex(&rwsem->lock, save_state, name, file, line); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__init_rwsem); + +static void set_new_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner, unsigned long eip) +{ + if (new_owner) + trace_special_pid(new_owner->pid, new_owner->prio, 0); + if (old_owner) + change_owner(lock, old_owner, new_owner); + lock->owner = new_owner; + lock->owner_prio = new_owner->prio; +#ifdef CONFIG_RT_DEADLOCK_DETECT + TRACE_WARN_ON(!list_empty(&lock->held_list)); + list_add_tail(&lock->held_list, &held_locks); + lock->acquire_eip = eip; +#endif +} + +/* + * handle the lock release when processes blocked on it that can now run + * - the spinlock must be held by the caller + */ +static inline struct task_struct * pick_new_owner(struct rt_mutex *lock, + struct task_struct *old_owner, int save_state, + unsigned long eip) +{ + struct rt_mutex_waiter *waiter = NULL; + struct task_struct *new_owner; + + /* + * Get the highest prio one: + * + * (same-prio RT tasks go FIFO) + */ + waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list); + + trace_special_pid(waiter->task->pid, waiter->task->prio, 0); + +#ifdef ALL_TASKS_PI + check_pi_list_present(lock, waiter, old_owner); +#endif + new_owner = waiter->task; + plist_del_init(&waiter->list, &lock->wait_list); + + plist_del(&waiter->pi_list, &old_owner->pi_waiters); + plist_init(&waiter->pi_list, waiter->task->prio); + + set_new_owner(lock, old_owner, new_owner, waiter->eip); + /* Don't touch waiter after ->task has been NULLed */ + mb(); + waiter->task = NULL; + new_owner->blocked_on = NULL; + TRACE_WARN_ON(save_state != lock->save_state); + + return new_owner; +} + +static inline void init_lists(struct rt_mutex *lock) +{ + // we have to do this until the static initializers get fixed: + if (!lock->wait_list.dp_node.prev && !lock->wait_list.dp_node.next) + plist_init(&lock->wait_list, 
MAX_PRIO); +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->held_list.prev && !lock->held_list.next) + INIT_LIST_HEAD(&lock->held_list); +#endif +} + +/* + * Try to grab a lock, and if it is owned but the owner + * hasn't woken up yet, see if we can steal it. + * + * Return: 1 if task can grab lock. + * 0 if not. + */ +static int grab_lock(struct rt_mutex *lock, struct task_struct *task) +{ + struct task_struct *owner = lock->owner; + + if (!owner) + return 1; + /* + * The lock is owned, but now test to see if the owner + * is still sleeping and hasn't woken up to get the lock. + */ + + /* Test the simple case first, is it already running? */ + if (!TASK_PENDING(owner)) + return 0; + + /* The owner is pending on a lock, but is it this lock? */ + if (owner->pending_owner != lock) + return 0; + + /* + * There's an owner, but it hasn't woken up to take the lock yet. + * See if we should steal it from him. + */ + if (task->prio > owner->prio) + return 0; + + /* + * The BKL is a PITA. Don't ever steal it + */ + if (lock == &kernel_sem.lock) + return 0; + + /* + * This task is of higher priority than the current pending + * owner, so we may steal it. + */ + owner->rt_flags &= ~RT_PENDOWNER; + owner->pending_owner = NULL; + +#ifdef CONFIG_RT_DEADLOCK_DETECT + /* + * This task will be taking the ownership away, and + * when it does, the lock can't be on the held list. + */ + TRACE_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); +#endif + return 1; +} + +/* + * Bring a task from pending ownership to owning a lock. + * + * Return 0 if we secured it, otherwise non-zero if it was + * stolen. + */ +static int capture_lock(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + struct rt_mutex *lock = waiter->lock; + unsigned long flags; + int ret = 0; + + /* + * The BKL is special, we always get it. 
+ */ + if (lock == &kernel_sem.lock) + return 0; + + trace_lock_irqsave(&trace_lock, flags); + spin_lock(&lock->wait_lock); + + if (!(task->rt_flags & RT_PENDOWNER)) { + /* someone else stole it */ + TRACE_BUG_ON(lock->owner == task); + if (grab_lock(lock,task)) { + /* we got it back! */ + struct task_struct *old_owner = lock->owner; + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, waiter->eip); + spin_unlock(&pi_lock); + ret = 0; + } else { + /* Add ourselves back to the list */ + task_blocks_on_lock(waiter,task,lock,waiter->eip); + ret = 1; + } + } else { + task->rt_flags &= ~RT_PENDOWNER; + task->pending_owner = NULL; + } + + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return ret; +} + +/* + * lock it semaphore-style: no worries about missed wakeups. + */ +static void __sched __down(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return; + } + + set_task_state(task, TASK_UNINTERRUPTIBLE); + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + +wait_again: + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + 
set_task_state(task, TASK_UNINTERRUPTIBLE); + } + /* + * Check to see if we didn't have ownership stolen. + */ + if (capture_lock(&waiter,task)) { + set_task_state(task, TASK_UNINTERRUPTIBLE); + goto wait_again; + } + + current->flags |= nosched_flag; + task->state = TASK_RUNNING; +} + +/* + * get a write lock on the rw-semaphore + */ +void fastcall __sched rt_down_write(struct rw_semaphore *rwsem) +{ + __down(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_write); + +/* + * get a read lock on the rw-semaphore + */ +void fastcall __sched rt_down_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + return __down(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_read); + +/* + * lock it mutex-style: this variant is very careful not to + * miss any non-mutex wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seamless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. 
+ */ +static void __sched __down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + unsigned long state, saved_state, nosched_flag; + struct task_struct *task = current; + struct rt_mutex_waiter waiter; + unsigned long flags; + int got_wakeup = 0; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return; + } + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll + * take any intermediate wakeup into account as well, + * independently of the mutex sleep/wakeup mechanism: + */ + saved_state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock(&trace_lock); + + /* + * TODO: check 'flags' for the IRQ bit here - it is illegal to + * call down() from an IRQs-off section that results in + * an actual reschedule. + */ + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + +wait_again: + /* wait to be given the lock */ + for (;;) { + unsigned long saved_flags = current->flags & PF_NOSCHED; + + if (!waiter.task) + break; + local_irq_enable(); + current->flags &= ~PF_NOSCHED; + schedule(); + current->flags |= saved_flags; + local_irq_disable(); + state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + if (state == TASK_RUNNING) + got_wakeup = 1; + } + /* + * Check to see if we didn't have ownership stolen. 
+ */ + if (capture_lock(&waiter,task)) { + state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + if (state == TASK_RUNNING) + got_wakeup = 1; + goto wait_again; + } + + /* + * Only set the task's state to TASK_RUNNING if it got + * a non-mutex wakeup. We keep the original state otherwise. + * A mutex wakeup changes the task's state to TASK_RUNNING_MUTEX, + * not TASK_RUNNING - hence we can differentiate between the two + * cases: + */ + state = xchg(&task->state, saved_state); + if (state == TASK_RUNNING) + got_wakeup = 1; + if (got_wakeup) + task->state = TASK_RUNNING; + local_irq_enable(); + preempt_check_resched(); + + current->flags |= nosched_flag; +} + +/* + * TODO: push this into __down_mutex() + * + * BKL users expect the BKL to be held across spinlock/rwlock-acquire. + * Save and clear it, this will cause the scheduler to not drop the + * BKL semaphore if we end up scheduling: + */ +#define SAVE_BKL(ACTION) \ +{ \ + struct task_struct *task = current; \ + int saved_lock_depth; \ + \ + saved_lock_depth = task->lock_depth; \ + task->lock_depth = -1; \ + \ + might_sleep(); \ + ACTION; \ + \ + task->lock_depth = saved_lock_depth; \ +} + + +static void __sched down_write_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +static void __sched down_read_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + /* + * Read locks within the write lock succeed. 
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +/* + * get a lock - interruptible + */ +static int __sched __down_interruptible(struct rt_mutex *lock, + unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + int ret; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return 0; + } + + set_task_state(task, TASK_INTERRUPTIBLE); + + plist_init(&waiter.list, task->prio); + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + + ret = 0; +wait_again: + /* wait to be given the lock */ + for (;;) { + if (signal_pending(current)) { + /* + * Remove ourselves from the wait list if we + * didnt get the lock - else return success: + */ + trace_lock_irq(&trace_lock); + spin_lock(&lock->wait_lock); + if (waiter.task) { + plist_del_init(&waiter.list, &lock->wait_list); + /* + * Just remove ourselves from the PI list. + * (No big problem if our PI effect lingers + * a bit - owner will restore prio.) 
+ */ + spin_lock(&pi_lock); + plist_del(&waiter.pi_list, &lock->owner->pi_waiters); + plist_init(&waiter.pi_list, waiter.task->prio); + /* the on-stack waiter dies when we return - unhook it: */ + task->blocked_on = NULL; + spin_unlock(&pi_lock); + ret = -EINTR; + } + spin_unlock(&lock->wait_lock); + trace_unlock_irq(&trace_lock); + break; + } + if (!waiter.task) + break; + schedule(); + set_task_state(task, TASK_INTERRUPTIBLE); + } + + /* + * Check to see if we didn't have ownership stolen. + */ + if (!ret && capture_lock(&waiter,task)) { + set_task_state(task, TASK_INTERRUPTIBLE); + goto wait_again; + } + + task->state = TASK_RUNNING; + current->flags |= nosched_flag; + + return ret; +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +static int __down_trylock(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags; + int ret = 0; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (grab_lock(lock,task)) { + /* granted */ + struct task_struct *old_owner = lock->owner; + TRACE_WARN_ON(!plist_empty(&lock->wait_list) && !old_owner); + spin_lock(&pi_lock); + set_new_owner(lock, old_owner, task, eip); + spin_unlock(&pi_lock); + ret = 1; + } + + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return ret; +} + +int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_write_trylock); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static int down_write_trylock_mutex(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +static int down_read_trylock_mutex(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +/* + * release the lock: + */ +static void __up_mutex(struct rt_mutex *lock, int save_state, unsigned long eip) +{ + struct task_struct *old_owner, *new_owner; + struct rt_mutex_waiter *w; + unsigned long flags; + int prio; + + TRACE_WARN_ON(save_state != lock->save_state); + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + TRACE_BUG_ON(!lock->wait_list.dp_node.prev && !lock->wait_list.dp_node.next); + +#ifdef CONFIG_RT_DEADLOCK_DETECT + TRACE_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); +#endif + spin_lock(&pi_lock); + + old_owner = lock->owner; +#ifdef ALL_TASKS_PI + if (plist_empty(&lock->wait_list)) + check_pi_list_empty(lock, old_owner); +#endif + lock->owner = NULL; + new_owner = NULL; + if (!plist_empty(&lock->wait_list)) + new_owner = pick_new_owner(lock, old_owner, save_state, eip); + + /* + * If the owner got priority-boosted then restore it + * to the previous priority (or to the next highest prio + * waiter's priority): + */ + prio = mutex_getprio(old_owner); + if (!plist_empty(&old_owner->pi_waiters)) { + w = plist_first_entry(&old_owner->pi_waiters, struct rt_mutex_waiter, pi_list); + if (w->task->prio < prio) + prio = w->task->prio; + } + if (prio != old_owner->prio) + pi_setprio(lock, old_owner, prio); + + if (new_owner) { + if (lock != &kernel_sem.lock) { + new_owner->rt_flags |= RT_PENDOWNER; + 
new_owner->pending_owner = lock; + } + if (save_state) + wake_up_process_mutex(new_owner); + else + wake_up_process(new_owner); + } + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + +#ifdef PREEMPT_DIRECT + trace_unlock(&trace_lock); + /* + * Common place where preemption is requested - if we can + * reschedule then do it here without enabling interrupts + * again (and lengthening latency): + */ + if (need_resched() && !irqs_disabled_flags(flags) && !preempt_count()) + preempt_schedule_irq(); + local_irq_restore(flags); +#else + trace_unlock_irqrestore(&trace_lock, flags); +#endif + /* no need to check for preempt here - we just handled it */ +} + +/* + * Do owner check too: + */ +void fastcall rt_up_write(struct rw_semaphore *rwsem) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_up_write); + +static void _up_write(struct rw_semaphore *rwsem, unsigned long eip) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, eip); +} + +void fastcall up_write_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 1, eip); +} + +/* + * release a read lock on the semaphore + */ +void fastcall rt_up_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return _up_write(rwsem, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_up_read); + +void fastcall up_read_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return up_write_mutex(rwsem, eip); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void fastcall rt_downgrade_write(struct rw_semaphore *rwsem) +{ + BUG(); +} +EXPORT_SYMBOL(rt_downgrade_write); + +static int rt_mutex_is_locked(struct rt_mutex *lock) +{ + int ret; + + mb(); + ret = lock->owner != NULL; + + return ret; +} + +int fastcall rt_rwsem_is_locked(struct rw_semaphore *rwsem) +{ + return rt_mutex_is_locked(&rwsem->lock); +} +EXPORT_SYMBOL(rt_rwsem_is_locked); + +static void _down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + __down_mutex(lock, eip); +} + +void fastcall __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __init_rt_mutex(&sem->lock, 0, name, file, line); + __down(&sem->lock, CALLER_ADDR0); + break; + default: + __init_rt_mutex(&sem->lock, 0, name, file, line); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + +static int down_trylock_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + return __down_trylock(lock, eip); +} + +void fastcall up_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + WARN_ON(lock->owner != current); + __up_mutex(lock, 1, eip); +} + +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. 
+ * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem, unsigned long eip) +{ + int count = atomic_dec_return(&sem->count); + + TRACE_WARN_ON(sem->lock.save_state != 0); + WARN_ON(count < 0); + + if (count > 0) + __up_mutex(&sem->lock, 0, eip); +} + +void fastcall rt_down(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + __down(&sem->lock, CALLER_ADDR0); + __down_complete(sem, CALLER_ADDR0); +} +EXPORT_SYMBOL(rt_down); + +int fastcall rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + TRACE_WARN_ON(sem->lock.save_state != 0); + ret = __down_interruptible(&sem->lock, CALLER_ADDR0); + if (ret) + return ret; + __down_complete(sem, CALLER_ADDR0); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int fastcall rt_down_trylock(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. 
It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (__down_trylock(&sem->lock, CALLER_ADDR0)) { + __down_complete(sem, CALLER_ADDR0); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void fastcall rt_up(struct semaphore *sem) +{ + int count; + + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (count == 1) + __up_mutex(&sem->lock, 0, CALLER_ADDR0); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +int fastcall rt_sem_is_locked(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return rt_mutex_is_locked(&sem->lock); +} +EXPORT_SYMBOL(rt_sem_is_locked); + +int fastcall rt_sema_count(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return atomic_read(&sem->count); +} +EXPORT_SYMBOL(rt_sema_count); + +/* + * Spinlock wrappers: + */ + +static void __spin_lock(spinlock_t *lock, unsigned long eip) +{ + SAVE_BKL(_down_mutex(&lock->lock, eip)); +} + +void _spin_lock(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock); + +void _spin_lock_bh(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_bh); + +void _spin_lock_irq(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_irq); + +unsigned long _spin_lock_irqsave(spinlock_t *spin) +{ + unsigned long flags; + + __spin_lock(spin, CALLER_ADDR0); + local_save_flags(flags); + + return flags; +} +EXPORT_SYMBOL(_spin_lock_irqsave); + +void _spin_unlock(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock); + +void _spin_unlock_wait(spinlock_t *lock) +{ + do { + barrier(); + } while (_spin_is_locked(lock)); +} 
+EXPORT_SYMBOL(_spin_unlock_wait); + +void _spin_unlock_bh(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_bh); + +void _spin_unlock_irq(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_irq); + +void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_irqrestore); + +int _spin_trylock(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock); + +int _spin_trylock_bh(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_bh); + +int _spin_trylock_irq(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_irq); + +int _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + local_save_flags(*flags); + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_irqsave); + +int _spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} +EXPORT_SYMBOL(_spin_is_locked); + +int _spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} +EXPORT_SYMBOL(_spin_can_lock); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + __spin_lock(lock, CALLER_ADDR0); + if (atomic_dec_and_test(atomic)) + return 1; + _spin_unlock(lock); + + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line) +{ + __init_rt_mutex(&lock->lock, 1, name, file, line); +} +EXPORT_SYMBOL(_spin_lock_init); + + +/* + * RW-lock wrappers: + */ +int _read_trylock(rwlock_t *rwlock) +{ + return down_read_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_read_trylock); + +int _write_trylock(rwlock_t *rwlock) +{ + return down_write_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_write_trylock); + +void 
_write_lock(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock); + +void _read_lock(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock); + +void _write_unlock(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock); + +void _read_unlock(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock); + +unsigned long _write_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + down_write_mutex(&rwlock->lock, CALLER_ADDR0); + + local_save_flags(flags); + return flags; +} +EXPORT_SYMBOL(_write_lock_irqsave); + +unsigned long _read_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + down_read_mutex(&rwlock->lock, CALLER_ADDR0); + + local_save_flags(flags); + return flags; +} +EXPORT_SYMBOL(_read_lock_irqsave); + +void _write_lock_irq(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock_irq); + +void _read_lock_irq(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock_irq); + +void _write_lock_bh(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock_bh); + +void _read_lock_bh(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock_bh); + +void _write_unlock_irq(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_irq); + +void _read_unlock_irq(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_irq); + +void _write_unlock_bh(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_bh); + +void _read_unlock_bh(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_bh); + +void _write_unlock_irqrestore(rwlock_t *rwlock, unsigned 
long flags) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_irqrestore); + +void _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_irqrestore); + +void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line) +{ + __init_rwsem(&rwlock->lock, 1, name, file, line); +} +EXPORT_SYMBOL(_rwlock_init); + +int _rwlock_is_locked(rwlock_t *rwlock) +{ + return rt_rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_rwlock_is_locked); + +/* + * TODO: is it ok if _read_can_lock() and _write_can_lock() does the same? + */ +int _read_can_lock(rwlock_t *rwlock) +{ + return !rt_rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_read_can_lock); + +int _write_can_lock(rwlock_t *rwlock) +{ + return !rt_rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_write_can_lock); + Index: linux/kernel/time.c =================================================================== --- linux.orig/kernel/time.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/time.c 2005-06-06 16:04:14.000000000 -0300 @@ -97,8 +97,31 @@ #endif /* __ARCH_WANT_SYS_TIME */ +int timeofday_API_hacks(struct timeval __user *tv, struct timezone __user *tz) +{ +#ifdef CONFIG_LATENCY_TRACE + if (!tv && ((long)tz == 1)) + return user_trace_start(); + if (!tv && !tz) + return user_trace_stop(); +#endif + if (((long)tv == 1) && ((long)tz == 1)) { + current->flags |= PF_NOSCHED; + return 0; + } + if (((long)tv == 1) && ((long)tz == 0)) { + current->flags &= ~PF_NOSCHED; + return 0; + } + return 1; +} + asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) { + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); @@ -184,6 +207,10 @@ struct timespec new_ts; struct timezone new_tz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if 
(copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; Index: linux/kernel/exit.c =================================================================== --- linux.orig/kernel/exit.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/exit.c 2005-06-06 16:04:14.000000000 -0300 @@ -49,8 +49,11 @@ if (thread_group_leader(p)) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); - if (p->pid) + if (p->pid) { + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); + } } REMOVE_LINKS(p); @@ -374,8 +377,10 @@ while (set) { if (set & 1) { struct file * file = xchg(&files->fd[i], NULL); - if (file) + if (file) { filp_close(file, files); + cond_resched(); + } } i++; set >>= 1; @@ -505,9 +510,11 @@ if (mm != tsk->active_mm) BUG(); /* more a memory barrier than a real lock */ task_lock(tsk); + preempt_disable(); // FIXME tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + preempt_enable(); task_unlock(tsk); mmput(mm); } @@ -766,10 +773,6 @@ /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -838,12 +841,18 @@ mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif - - BUG_ON(!(current->flags & PF_DEAD)); - schedule(); - BUG(); - /* Avoid "noreturn function does return".  */ - for (;;) ; + check_no_held_locks(tsk); + /* PF_DEAD causes final put_task_struct after we schedule. */ +again: + local_irq_disable(); + tsk->flags |= PF_DEAD; + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08lx, count: %d, state: %08lx\n", + current->flags, atomic_read(&current->usage), current->state); + printk(KERN_ERR ".... 
trying again ...\n"); + goto again; } EXPORT_SYMBOL_GPL(do_exit); @@ -1343,6 +1352,7 @@ list_for_each(_p,&tsk->children) { p = list_entry(_p,struct task_struct,sibling); + BUG_ON(!atomic_read(&p->usage)); ret = eligible_child(pid, options, p); if (!ret) continue; Index: linux/kernel/printk.c =================================================================== --- linux.orig/kernel/printk.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/printk.c 2005-06-06 16:04:14.000000000 -0300 @@ -83,7 +83,7 @@ * It is also used in interesting ways to provide interlocking in * release_console_sem(). */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -363,10 +363,12 @@ { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write) con->write(con, &LOG_BUF(start), end - start); } + touch_critical_timing(); } /* @@ -470,6 +472,7 @@ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ init_MUTEX(&console_sem); + zap_rt_locks(); } #if defined(CONFIG_PRINTK_TIME) @@ -738,8 +741,17 @@ } console_locked = 0; console_may_schedule = 0; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. 
+ */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); } @@ -970,7 +982,7 @@ */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); + static DEFINE_RAW_SPINLOCK(ratelimit_lock); static unsigned long toks = 10*5*HZ; static unsigned long last_msg; static int missed; Index: linux/kernel/posix-timers.c =================================================================== --- linux.orig/kernel/posix-timers.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/posix-timers.c 2005-06-06 16:04:14.000000000 -0300 @@ -94,7 +94,7 @@ */ #define TIMER_INACTIVE 1 -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) # define timer_active(tmr) \ ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) # define set_timer_inactive(tmr) \ @@ -102,10 +102,28 @@ (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ } while (0) #else -# define timer_active(tmr) BARFY // error to use outside of SMP +# define timer_active(tmr) BARFY /* error to use outside of SMP | RT */ # define set_timer_inactive(tmr) do { } while (0) #endif /* + * For RT the timer call backs are preemptable. This means that folks + * trying to delete timers may run into timers that are "active" for + * long times. To help out with this we provide a wake up function to + * wake up a caller who wants waking when a timer clears the call back. + * This is the same sort of thing that the del_timer_sync does, but we + * need (in the HRT case) to cover two lists and not just the one. 
+ */ +#ifdef CONFIG_PREEMPT_SOFTIRQS +#include +static DECLARE_WAIT_QUEUE_HEAD(timer_wake_queue); +#define wake_timer_waiters() wake_up(&timer_wake_queue) +#define wait_for_timer(timer) wait_event(timer_wake_queue, !timer_active(timer)) + +#else +#define wake_timer_waiters() +#define wait_for_timer(timer) +#endif +/* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. */ @@ -527,6 +545,7 @@ schedule_next_timer(timr); } unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ + wake_timer_waiters(); } @@ -983,18 +1002,20 @@ * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ -#ifdef CONFIG_SMP - if (timer_active(timr) && !del_timer(&timr->it.real.timer)) - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ - return TIMER_RETRY; - +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + if (timer_active(timr) && !del_timer(&timr->it.real.timer)) { + /* + * It can only be active if on an other cpu (unless RT). + * Since we have cleared the interval stuff above, it + * should clear once we release the spin lock. Of + * course once we do that anything could happen, + * including the complete melt down of the timer. So + * return with a "retry" exit status. If RT we do a + * formal wait as the function code is fully + * preemptable... + */ + return TIMER_RETRY; + } set_timer_inactive(timr); #else del_timer(&timr->it.real.timer); @@ -1069,7 +1090,8 @@ unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + wait_for_timer(timr); + rtn = NULL; /* We already got the old time... 
*/ goto retry; } @@ -1083,17 +1105,19 @@ static inline int common_timer_del(struct k_itimer *timer) { timer->it.real.incr = 0; -#ifdef CONFIG_SMP - if (timer_active(timer) && !del_timer(&timer->it.real.timer)) - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. - */ - return TIMER_RETRY; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + if (timer_active(timer) && !del_timer(&timer->it.real.timer)) { + /* + * It can only be active if on an other cpu (unless RT). + * Since we have cleared the interval stuff above, it + * should clear once we release the spin lock. Of + * course once we do that anything could happen, + * including the complete melt down of the timer. So + * return with a "retry" exit status. For RT we do a + * formal wait as it could take a while. 
+ */ + return TIMER_RETRY; + } #else del_timer(&timer->it.real.timer); #endif @@ -1114,7 +1138,7 @@ struct k_itimer *timer; long flags; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) int error; retry_delete: #endif @@ -1122,7 +1146,7 @@ if (!timer) return -EINVAL; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) error = timer_delete_hook(timer); if (error == TIMER_RETRY) { @@ -1155,17 +1179,18 @@ { unsigned long flags; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) int error; retry_delete: #endif spin_lock_irqsave(&timer->it_lock, flags); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) error = timer_delete_hook(timer); if (error == TIMER_RETRY) { unlock_timer(timer, flags); + wait_for_timer(timer); goto retry_delete; } #else @@ -1424,6 +1449,14 @@ list_del_init(&timr->it.real.abs_timer_entry); if (add_clockset_delta(timr, &new_wall_to) && del_timer(&timr->it.real.timer))  /* timer run yet? */ + /* + * Note that we only do this if the timer is/was + * in the list. If it happens to be active and + * not in the timer list, it must be in the call + * back function, we leave it to that code to do + * the right thing.  I.e. we do NOT need + * del_timer_sync() + */ add_timer(&timr->it.real.timer); list_add(&timr->it.real.abs_timer_entry, &abs_list.list); spin_unlock_irq(&abs_list.lock); Index: linux/kernel/rcupdate.c =================================================================== --- linux.orig/kernel/rcupdate.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/rcupdate.c 2005-06-06 16:04:14.000000000 -0300 @@ -479,3 +479,39 @@ EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006.
*/ + +#ifdef CONFIG_PREEMPT_RCU + +void rcu_read_lock(void) +{ + if (current->rcu_read_lock_nesting++ == 0) { + current->rcu_data = &get_cpu_var(rcu_data); + atomic_inc(&current->rcu_data->active_readers); + smp_mb__after_atomic_inc(); + put_cpu_var(rcu_data); + } +} +EXPORT_SYMBOL(rcu_read_lock); + +void rcu_read_unlock(void) +{ + int cpu; + + if (--current->rcu_read_lock_nesting == 0) { + atomic_dec(&current->rcu_data->active_readers); + smp_mb__after_atomic_dec(); + /* + * Check whether we have reached quiescent state. + * Note! This is only for the local CPU, not for + * current->rcu_data's CPU [which typically is the + * current CPU, but may also be another CPU]. + */ + cpu = get_cpu(); + rcu_qsctr_inc(cpu); + put_cpu(); + } +} +EXPORT_SYMBOL(rcu_read_unlock); + +#endif + Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c 2005-06-06 10:18:16.000000000 -0300 +++ linux/kernel/sched.c 2005-06-06 16:04:14.000000000 -0300 @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002  Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar */ #include @@ -46,6 +48,7 @@ #include #include #include +#include #include #include @@ -185,6 +188,7 @@ typedef struct runqueue runqueue_t; struct prio_array { + runqueue_t *rq; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -198,7 +202,7 @@ * acquire operations must be ordered by ascending &runqueue.
*/ struct runqueue { - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -206,6 +210,9 @@ */ unsigned long nr_running; #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + unsigned long rt_nr_running; +#endif unsigned long cpu_load; #endif unsigned long long nr_switches; @@ -269,11 +276,23 @@ #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. + */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + +/* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define _finish_arch_switch(rq, next) spin_unlock(&(rq)->lock) # define task_running(rq, p) ((rq)->curr == (p)) #endif @@ -536,6 +555,33 @@ #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ +int rt_overload_schedule, rt_overload_wakeup, rt_overload_pulled; + +__cacheline_aligned_in_smp atomic_t rt_overload; + +static inline void inc_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + rq->rt_nr_running++; + if (rq->rt_nr_running == 2) + atomic_inc(&rt_overload); + } +#endif +} + +static inline void dec_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + WARN_ON(!rq->rt_nr_running); + rq->rt_nr_running--; + if (rq->rt_nr_running == 1) + atomic_dec(&rt_overload); + } +#endif +} + /* * Adding/removing a task to/from a priority array: */ @@ -545,15 +591,21 @@ list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_rt_tasks(p, array->rq); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { + if (p->flags & 
PF_DEAD) { + printk("BUG: %s/%d: dead task enqueued!\n", p->comm, p->pid); + dump_stack(); + } sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + inc_rt_tasks(p, array->rq); } /* @@ -587,13 +639,11 @@ * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline i