diff -Nru linux-2.4.13.vanilla/Makefile linux-2.4.13.latxs/Makefile
--- linux-2.4.13.vanilla/Makefile	Tue Oct 23 22:21:20 2001
+++ linux-2.4.13.latxs/Makefile	Thu Oct 25 11:38:53 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 13
-EXTRAVERSION =
+EXTRAVERSION = latxs
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -Nru linux-2.4.13.vanilla/arch/i386/kernel/process.c linux-2.4.13.latxs/arch/i386/kernel/process.c
--- linux-2.4.13.vanilla/arch/i386/kernel/process.c	Thu Oct  4 18:42:54 2001
+++ linux-2.4.13.latxs/arch/i386/kernel/process.c	Thu Oct 25 11:35:54 2001
@@ -135,6 +135,9 @@
 			idle();
 		schedule();
 		check_pgt_cache();
+#ifdef CONFIG_SMP
+		runqueue_balance(IDLE_RQBALANCE);
+#endif	/* #ifdef CONFIG_SMP */
 	}
 }
 
diff -Nru linux-2.4.13.vanilla/arch/i386/kernel/smpboot.c linux-2.4.13.latxs/arch/i386/kernel/smpboot.c
--- linux-2.4.13.vanilla/arch/i386/kernel/smpboot.c	Thu Oct  4 18:42:54 2001
+++ linux-2.4.13.latxs/arch/i386/kernel/smpboot.c	Thu Oct 25 11:35:55 2001
@@ -771,7 +771,7 @@
 
 extern unsigned long cpu_initialized;
 
-static void __init do_boot_cpu (int apicid) 
+static void __init do_boot_cpu (int apicid)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -799,15 +799,14 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
-
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->has_cpu = 1; /* we schedule the first task manually */
-	idle->thread.eip = (unsigned long) start_secondary;
-
 	del_from_runqueue(idle);
 	unhash_process(idle);
+
+	idle->has_cpu = 1; /* we schedule the first task manually */
+	idle->thread.eip = (unsigned long) start_secondary;
+	idle->processor = cpu;
 	init_tasks[cpu] = idle;
 
 	/* start_eip had better be page-aligned! */
@@ -830,7 +829,7 @@
 		/* stash the current NMI vector, so we can put things back */
 		nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
 		nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
-	} 
+	}
 
 	CMOS_WRITE(0xa, 0xf);
 	local_flush_tlb();
diff -Nru linux-2.4.13.vanilla/drivers/char/Makefile linux-2.4.13.latxs/drivers/char/Makefile
--- linux-2.4.13.vanilla/drivers/char/Makefile	Mon Oct 15 13:36:48 2001
+++ linux-2.4.13.latxs/drivers/char/Makefile	Thu Oct 25 11:37:50 2001
@@ -16,7 +16,7 @@
 
 O_TARGET := char.o
 
-obj-y	 += mem.o tty_io.o n_tty.o tty_ioctl.o raw.o pty.o misc.o random.o
+obj-y	 += mem.o tty_io.o n_tty.o tty_ioctl.o raw.o pty.o misc.o random.o latsched.o
 
 # All of the (potential) objects that export symbols.
 # This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'.
diff -Nru linux-2.4.13.vanilla/drivers/char/latsched.c linux-2.4.13.latxs/drivers/char/latsched.c
--- linux-2.4.13.vanilla/drivers/char/latsched.c	Wed Dec 31 16:00:00 1969
+++ linux-2.4.13.latxs/drivers/char/latsched.c	Thu Oct 25 11:37:50 2001
@@ -0,0 +1,177 @@
+/*
+ *  linux/drivers/char/latsched.c
+ *
+ *  Kernel scheduler latency tester
+ *
+ *  Copyright (C) 2001, Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/malloc.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+#include <linux/smp_lock.h>
+#include <linux/wrapper.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+
+#include <linux/latsched.h>
+
+
+
+
+
+#define DEBUG	0	/* verbosity threshold: DNPRINTK(n, ...) prints when n <= DEBUG */
+#ifdef DEBUG		/* delete the #define above to compile all tracing out */
+#define DPRINTK(x)	printk x
+#define DNPRINTK(n,x)	do { if ((n) <= DEBUG) printk x; } while (0)
+#else
+#define DPRINTK(x)	do { } while (0)
+#define DNPRINTK(n,x)	do { } while (0)
+#endif
+
+
+
+struct latsched {
+	/* per-open context hung off file->private_data; no fields yet */
+};
+
+
+
+static int open_latsched(struct inode *inode, struct file *file);
+static int close_latsched(struct inode *inode, struct file *file);
+static int ioctl_latsched(struct inode *inode, struct file *file,
+		unsigned int cmd, unsigned long arg);
+
+
+static struct file_operations latsched_fops = {
+	ioctl: ioctl_latsched,
+	open: open_latsched,
+	release: close_latsched
+};
+
+static struct miscdevice latsched = {
+	LATSCHED_MINOR, "latsched", &latsched_fops
+};
+
+
+
+
+
+
+
+/* open(): allocate the zeroed per-open context and take a module reference. */
+static int open_latsched(struct inode *inode, struct file *file)
+{
+	struct latsched *ls;
+
+	ls = kmalloc(sizeof(*ls), GFP_KERNEL);
+	if (!ls)
+		return -ENOMEM;
+
+	memset(ls, 0, sizeof(*ls));
+
+	file->private_data = ls;
+
+	MOD_INC_USE_COUNT;
+
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: open() ls=%p\n", current, ls));
+	return 0;
+}
+
+
+/* release(): free the per-open context set up by open_latsched(). */
+static int close_latsched(struct inode *inode, struct file *file)
+{
+	struct latsched *ctx = file->private_data;
+
+	kfree(ctx);
+	MOD_DEC_USE_COUNT;
+	/* only the stale address is printed below; ctx is not dereferenced */
+	DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: close() ls=%p\n", current, ctx));
+	return 0;
+}
+
+
+/* ioctl(): LS_START/LS_STOP call latsched_start(1/0), LS_SAMPLES hands arg to latsched_setsamples(), LS_FETCH round-trips a struct lsctl_getdata through latsched_getdata(). */
+static int ioctl_latsched(struct inode *inode, struct file *file,
+		unsigned int cmd, unsigned long arg)
+{
+	int res;
+	struct latsched *ls = (struct latsched *) file->private_data;
+	struct lsctl_getdata lsgd;
+
+	switch (cmd) {
+	case LS_START:
+		res = latsched_start(1);
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_START) == %d\n",
+				current, ls, res));
+		return res;
+
+	case LS_STOP:
+		res = latsched_start(0);
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_STOP) == %d\n",
+				current, ls, res));
+		return res;
+
+	case LS_FETCH:
+		if ((res = verify_area(VERIFY_WRITE, (void *) arg, sizeof(struct lsctl_getdata))))
+			return res;
+		/* a fault in mid-copy would leave lsgd partly uninitialized: fail instead */
+		if (__copy_from_user(&lsgd, (void *) arg, sizeof(struct lsctl_getdata)))
+			return -EFAULT;
+		/* refuse counts that are negative or whose byte size would wrap and defeat verify_area() */
+		if (lsgd.size < 0 || lsgd.size > (int) (INT_MAX / sizeof(struct latsched_sample)))
+			return -EINVAL;
+		if ((res = verify_area(VERIFY_WRITE, (void *) lsgd.data, lsgd.size * sizeof(struct latsched_sample))))
+			return res;
+		if (!(res = latsched_getdata(&lsgd)) && __copy_to_user((void *) arg, &lsgd, sizeof(struct lsctl_getdata)))
+			res = -EFAULT;	/* the updated control block did not reach user space */
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_FETCH, %d) == %d\n",
+				current, ls, lsgd.cpu, res));
+		return res;
+
+	case LS_SAMPLES:
+		res = latsched_setsamples((int) arg);
+		DNPRINTK(3, (KERN_INFO "[%p] /dev/latsched: ioctl(%p, LS_SAMPLES, %lu) == %d\n",
+				current, ls, arg, res));
+		return res;
+	}
+	return -EINVAL;
+}
+
+
+
+
+/* Register the /dev/latsched misc device, propagating misc_register()'s error code on failure. */
+int __init init_latsched(void)
+{
+	int res = misc_register(&latsched);
+	if (res)
+		return res;
+	printk(KERN_INFO "[%p] /dev/latsched: driver installed.\n", current);
+	return 0;
+}
+
+
+module_init(init_latsched);
+
diff -Nru linux-2.4.13.vanilla/include/linux/latsched.h linux-2.4.13.latxs/include/linux/latsched.h
--- linux-2.4.13.vanilla/include/linux/latsched.h	Wed Dec 31 16:00:00 1969
+++ linux-2.4.13.latxs/include/linux/latsched.h	Fri Oct 26 16:49:27 2001
@@ -0,0 +1,41 @@
+/*
+ *  linux/include/linux/latsched.h
+ *
+ *  Kernel scheduler latency tester
+ *
+ *  Copyright (C) 2001, Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_LATSCHED_H
+#define _LINUX_LATSCHED_H
+
+#include <asm/timex.h>
+
+#define LATSCHED_MINOR	117
+#define STD_LATSCHED_SAMPLES	1024
+
+struct latsched_sample {
+	cycles_t lss_in, lss_out;	/* presumably cycle stamps at switch-in / switch-out -- TODO confirm against the recording code */
+	pid_t lss_pid;	/* presumably the pid of the sampled task -- TODO confirm */
+};
+struct latsched_data {
+	struct latsched_sample *ls_data;	/* sample array; sched.c indexes it through latsched_samp(cpu, idx) */
+	int ls_size;	/* presumably the number of entries in ls_data -- TODO confirm */
+	int ls_curr;	/* presumably the next slot to fill -- TODO confirm */
+};
+struct lsctl_getdata {	/* argument block of the LS_FETCH ioctl */
+	int cpu;	/* presumably the cpu whose samples are requested -- NOTE(review): no range check is visible in the driver */
+	int size;	/* capacity of data[], counted in struct latsched_sample entries */
+	struct latsched_sample *data;	/* user buffer the driver checks for write access before fetching */
+	int rsize;	/* presumably the number of samples actually returned -- TODO confirm */
+};
+
+#define LS_START	_IO('P', 1)
+#define LS_STOP		_IO('P', 2)
+#define LS_FETCH	_IOWR('P', 3, struct lsctl_getdata)
+#define LS_SAMPLES	_IOR('P', 4, int)
+
+
+#endif	/* #ifndef _LINUX_LATSCHED_H */
+
diff -Nru linux-2.4.13.vanilla/include/linux/sched.h linux-2.4.13.latxs/include/linux/sched.h
--- linux-2.4.13.vanilla/include/linux/sched.h	Tue Oct 23 21:59:06 2001
+++ linux-2.4.13.latxs/include/linux/sched.h	Fri Oct 26 16:49:27 2001
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 
 #include <asm/system.h>
+#include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -26,6 +27,7 @@
 #include <linux/signal.h>
 #include <linux/securebits.h>
 #include <linux/fs_struct.h>
+#include <linux/latsched.h>
 
 struct exec_domain;
 
@@ -72,7 +74,10 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+#define nr_running	atomic_read(&gnr_running)
+
+extern atomic_t gnr_running;
+extern int nr_threads;
 extern int last_pid;
 
 #include <linux/fs.h>
@@ -139,9 +144,12 @@
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
+extern void latsched_init(void);
+extern int latsched_start(int on);
+extern int latsched_setsamples(int nsamps);
+extern int latsched_getdata(struct lsctl_getdata *lsgd);
 extern void sched_init(void);
 extern void init_idle(void);
 extern void show_state(void);
@@ -312,6 +320,7 @@
 	 */
 	struct list_head run_list;
 	unsigned long sleep_time;
+	unsigned long cpu_jtime, sched_jtime;
 
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
@@ -332,9 +341,9 @@
 	pid_t tgid;
 	/* boolean value for session group leader */
 	int leader;
-	/* 
+	/*
 	 * pointers to (original) parent process, youngest child, younger sibling,
-	 * older sibling, respectively.  (p->father can be replaced with 
+	 * older sibling, respectively.  (p->father can be replaced with
 	 * p->p_pptr->pid)
 	 */
 	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
@@ -393,12 +402,15 @@
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
-	
+
 /* Thread group tracking */
    	u32 parent_exec_id;
    	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty */
 	spinlock_t alloc_lock;
+/* a better place for these brothers must be found */
+	int move_to_cpu;
+	struct list_head proclist_cpu;
 };
 
 /*
@@ -485,7 +497,10 @@
     sig:		&init_signals,					\
     pending:		{ NULL, &tsk.pending.head, {{0}}},		\
     blocked:		{{0}},						\
-    alloc_lock:		SPIN_LOCK_UNLOCKED				\
+    alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+	move_to_cpu:	0,						\
+	proclist_cpu:	LIST_HEAD_INIT(tsk.proclist_cpu),			\
+	cpu_jtime:		0,						\
 }
 
 
@@ -765,6 +780,20 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+extern void del_from_runqueue(struct task_struct * p);
+extern void add_to_proclist(struct task_struct * p);
+extern void del_from_proclist(struct task_struct * p);
+extern int move_to_cpu(struct task_struct * p, int cpu, int stick);
+extern int get_best_cpu(void);
+extern int runqueue_balance(int mode);
+extern void runqueue_spin_lock(struct task_struct * p);
+extern void runqueue_spin_unlock(struct task_struct * p);
+
+
+#define IDLE_RQBALANCE	0
+
+
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -808,7 +837,7 @@
 	current->state = TASK_RUNNING;					\
 	remove_wait_queue(&wq, &__wait);				\
 } while (0)
-	
+
 #define wait_event_interruptible(wq, condition)				\
 ({									\
 	int __ret = 0;							\
@@ -818,6 +847,7 @@
 })
 
 #define REMOVE_LINKS(p) do { \
+	del_from_proclist(p); \
 	(p)->next_task->prev_task = (p)->prev_task; \
 	(p)->prev_task->next_task = (p)->next_task; \
 	if ((p)->p_osptr) \
@@ -829,6 +859,7 @@
 	} while (0)
 
 #define SET_LINKS(p) do { \
+	add_to_proclist(p); \
 	(p)->next_task = &init_task; \
 	(p)->prev_task = init_task.prev_task; \
 	init_task.prev_task->next_task = (p); \
@@ -845,13 +876,6 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
diff -Nru linux-2.4.13.vanilla/init/main.c linux-2.4.13.latxs/init/main.c
--- linux-2.4.13.vanilla/init/main.c	Fri Oct 12 10:17:15 2001
+++ linux-2.4.13.latxs/init/main.c	Thu Oct 25 11:37:50 2001
@@ -573,7 +573,7 @@
 		/* only text is profiled */
 		prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
 		prof_len >>= prof_shift;
-		
+
 		size = prof_len * sizeof(unsigned int) + PAGE_SIZE-1;
 		prof_buffer = (unsigned int *) alloc_bootmem(size);
 	}
@@ -611,12 +611,13 @@
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
-	/* 
-	 *	We count on the initial thread going ok 
+	/*
+	 *	We count on the initial thread going ok
 	 *	Like idlers init is an unlocked kernel thread, which will
 	 *	make syscalls (and thus be locked).
 	 */
 	smp_init();
+	latsched_init();
 	rest_init();
 }
 
diff -Nru linux-2.4.13.vanilla/kernel/fork.c linux-2.4.13.latxs/kernel/fork.c
--- linux-2.4.13.vanilla/kernel/fork.c	Tue Oct 23 17:44:15 2001
+++ linux-2.4.13.latxs/kernel/fork.c	Thu Oct 25 17:42:38 2001
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 
+#include <asm/atomic.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -28,7 +29,7 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
+atomic_t gnr_running = ATOMIC_INIT(0);
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
@@ -598,7 +599,7 @@
 	 */
 	if (nr_threads >= max_threads)
 		goto bad_fork_cleanup_count;
-	
+
 	get_exec_domain(p->exec_domain);
 
 	if (p->binfmt && p->binfmt->module)
@@ -639,7 +640,7 @@
 	{
 		int i;
 		p->has_cpu = 0;
-		p->processor = current->processor;
+		p->processor = clone_flags & CLONE_PID ? current->processor: get_best_cpu();
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
 			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -665,10 +666,10 @@
 	if (retval)
 		goto bad_fork_cleanup_mm;
 	p->semundo = NULL;
-	
+
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-	   
+
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -687,6 +688,10 @@
 	if (!current->counter)
 		current->need_resched = 1;
 
+	p->cpu_jtime = 0;
+	p->sched_jtime = jiffies;
+	p->move_to_cpu = 0;
+
 	/*
 	 * Ok, add it to the run-queues and make it
 	 * visible to the rest of the system.
@@ -774,7 +779,7 @@
 		panic("Cannot create signal action SLAB cache");
 
 	files_cachep = kmem_cache_create("files_cache", 
-			 sizeof(struct files_struct), 0, 
+			 sizeof(struct files_struct), 0,
 			 SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (!files_cachep) 
 		panic("Cannot create files SLAB cache");
diff -Nru linux-2.4.13.vanilla/kernel/ksyms.c linux-2.4.13.latxs/kernel/ksyms.c
--- linux-2.4.13.vanilla/kernel/ksyms.c	Wed Oct 17 14:32:50 2001
+++ linux-2.4.13.latxs/kernel/ksyms.c	Thu Oct 25 11:34:51 2001
@@ -443,7 +443,6 @@
 #endif
 
 EXPORT_SYMBOL(kstat);
-EXPORT_SYMBOL(nr_running);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -Nru linux-2.4.13.vanilla/kernel/sched.c linux-2.4.13.latxs/kernel/sched.c
--- linux-2.4.13.vanilla/kernel/sched.c	Wed Oct 17 14:14:37 2001
+++ linux-2.4.13.latxs/kernel/sched.c	Sun Oct 28 20:00:25 2001
@@ -28,6 +28,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/completion.h>
 #include <linux/prefetch.h>
+#include <linux/slab.h>
+#include <linux/latsched.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,24 +76,19 @@
  *	Init task must be ok at boot for the ix86 as we will check its signals
  *	via the SMP irq return path.
  */
- 
+
 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
 
 /*
  * The tasklist_lock protects the linked list of processes.
  *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
- *
- * If both locks are to be concurrently held, the runqueue_lock
+ * If both locks are to be concurrently held, the runqueue_lock(cpu)
  * nests inside the tasklist_lock.
  *
  * task->alloc_lock nests inside tasklist_lock.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
 
 /*
  * We align per-CPU scheduling data on cacheline boundaries,
@@ -99,14 +96,52 @@
  */
 static union {
 	struct schedule_data {
+		atomic_t qnr_processes;
+		atomic_t qnr_running;
+		struct list_head proclist_head;
+		struct list_head runqueue_head;
+		spinlock_t runqueue_lock;
 		struct task_struct * curr;
-		cycles_t last_schedule;
+		struct latsched_data ls;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS] __cacheline_aligned;
+
+#ifdef CONFIG_SMP
 
 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define qnr_processes(cpu) aligned_data[(cpu)].schedule_data.qnr_processes
+#define qnr_running(cpu) aligned_data[(cpu)].schedule_data.qnr_running
+#define proclist_head(cpu) aligned_data[(cpu)].schedule_data.proclist_head
+#define runqueue_head(cpu) aligned_data[(cpu)].schedule_data.runqueue_head
+#define runqueue_lock(cpu) aligned_data[(cpu)].schedule_data.runqueue_lock
+
+
+#define latsched_data(cpu) aligned_data[(cpu)].schedule_data.ls.ls_data
+#define latsched_samp(cpu, idx) aligned_data[(cpu)].schedule_data.ls.ls_data[(idx)]
+#define latsched_size(cpu) aligned_data[(cpu)].schedule_data.ls.ls_size
+#define latsched_curr(cpu) aligned_data[(cpu)].schedule_data.ls.ls_curr
+
+#else	/* #ifdef CONFIG_SMP */
+
+#define cpu_curr(cpu) aligned_data[0].schedule_data.curr
+#define qnr_processes(cpu) aligned_data[0].schedule_data.qnr_processes
+#define qnr_running(cpu) aligned_data[0].schedule_data.qnr_running
+#define proclist_head(cpu) aligned_data[0].schedule_data.proclist_head
+#define runqueue_head(cpu) aligned_data[0].schedule_data.runqueue_head
+#define runqueue_lock(cpu) aligned_data[0].schedule_data.runqueue_lock
+
+
+#define latsched_data(cpu) aligned_data[0].schedule_data.ls.ls_data
+#define latsched_samp(cpu, idx) aligned_data[0].schedule_data.ls.ls_data[(idx)]
+#define latsched_size(cpu) aligned_data[0].schedule_data.ls.ls_size
+#define latsched_curr(cpu) aligned_data[0].schedule_data.ls.ls_curr
+
+#endif	/* #ifdef CONFIG_SMP */
+
+
+static atomic_t lss_enabled = ATOMIC_INIT(0);
+
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
@@ -124,8 +159,29 @@
 
 #endif
 
+/* Lock/unlock the runqueue task p currently sits on (p->processor), optionally masking local irqs. */
+#define rq_lock(p)	lock_task_rq(p)
+#define rq_unlock(p)	spin_unlock(&runqueue_lock((p)->processor))
+#define rq_lock_irq(p)	do { local_irq_disable(); lock_task_rq(p); } while (0)
+#define rq_unlock_irq(p)	do { spin_unlock(&runqueue_lock((p)->processor)); local_irq_enable(); } while (0)
+#define rq_lock_irqsave(p, f)	do { local_irq_save(f); lock_task_rq(p); } while (0)
+#define rq_unlock_irqrestore(p, f)	do { spin_unlock(&runqueue_lock((p)->processor)); local_irq_restore(f); } while (0)
+
+
 void scheduling_functions_start_here(void) { }
 
+static inline void lock_task_rq(struct task_struct *p)
+{
+	int cpu;
+	for (;;) {	/* retry until the lock we hold is the one of p's current cpu */
+		cpu = p->processor;
+		spin_lock(&runqueue_lock(cpu));
+		if (p->processor == cpu)
+			return;
+		spin_unlock(&runqueue_lock(cpu));
+	}
+}
+
 /*
  * This is the function that decides how desirable a process is..
  * You can weigh different processes against each other depending
@@ -140,7 +196,7 @@
  *	 +1000: realtime process, select this.
  */
 
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+static inline int goodness(struct task_struct * p, struct mm_struct *this_mm)
 {
 	int weight;
 
@@ -167,13 +223,12 @@
 		weight = p->counter;
 		if (!weight)
 			goto out;
-			
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
+
+		/* add an advantage related to the history of this task on this cpu;
+		 * this tries to account for the cache footprint of p on this cpu
+		 */
+		if (p->cpu_jtime > jiffies)
+			weight += p->cpu_jtime - jiffies;
 
 		/* .. and a slight advantage to the current MM */
 		if (p->mm == this_mm || !p->mm)
@@ -196,9 +251,9 @@
  * the 'goodness value' of replacing a process on a given CPU.
  * positive value means 'replace', zero or negative means 'dont'.
  */
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p)
 {
-	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+	return goodness(p, prev->active_mm) - goodness(prev, prev->active_mm);
 }
 
 /*
@@ -211,92 +266,33 @@
 static void reschedule_idle(struct task_struct * p)
 {
 #ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
-
-	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
-			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
-			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
-			return;
-		}
-	}
+	int best_cpu = p->processor, this_cpu = smp_processor_id(), need_resched;
+	struct task_struct *tsk;
 
-	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
-
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
+	tsk = cpu_curr(best_cpu);
+	if (tsk == idle_task(best_cpu)) {
 		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
+		 * If need_resched == -1 then we can skip sending
+		 * the IPI altogether, tsk->need_resched is
+		 * actively watched by the idle thread.
 		 */
-		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
-
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
-				}
-			}
-		}
-	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
-		}
+		need_resched = tsk->need_resched;
+		tsk->need_resched = 1;
+		if ((best_cpu != this_cpu) && !need_resched)
+			smp_send_reschedule(best_cpu);
+	} else if (tsk != p && preemption_goodness(tsk, p) > 0) {
 		tsk->need_resched = 1;
 		if (tsk->processor != this_cpu)
 			smp_send_reschedule(tsk->processor);
 	}
-	return;
-		
-
-#else /* UP */
+#else	/* #ifdef CONFIG_SMP */
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk;
 
 	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	if (preemption_goodness(tsk, p) > 0)
 		tsk->need_resched = 1;
-#endif
+#endif	/* #ifdef CONFIG_SMP */
 }
 
 /*
@@ -306,22 +302,172 @@
  * run-queue, not the end. See the comment about "This is
  * subtle" in the scheduler proper..
  */
-static inline void add_to_runqueue(struct task_struct * p)
+static inline void __add_to_runqueue(struct task_struct * p)
+{
+	list_add(&p->run_list, &runqueue_head(p->processor));	/* front of the queue of p's cpu */
+	atomic_inc(&qnr_running(p->processor));	/* per-cpu runnable count */
+	atomic_inc(&gnr_running);	/* global count, read through the nr_running macro */
+}
+
+static inline void __del_from_runqueue(struct task_struct * p)
+{
+	atomic_dec(&gnr_running);	/* global count, read through the nr_running macro */
+	atomic_dec(&qnr_running(p->processor));	/* per-cpu runnable count */
+	p->sleep_time = jiffies;	/* stamp the moment p left the runqueue */
+	list_del(&p->run_list);
+	p->run_list.next = NULL;	/* mark the entry as unlinked */
+}
+
+void del_from_runqueue(struct task_struct * p)
+{
+	unsigned long flags;
+	/* locked wrapper: irqs off and p's runqueue lock held around the raw removal */
+	rq_lock_irqsave(p, flags);
+	__del_from_runqueue(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __add_to_proclist(struct task_struct * p)
+{
+	list_add(&p->proclist_cpu, &proclist_head(p->processor));
+	atomic_inc(&qnr_processes(p->processor));
+}
+
+void add_to_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+	/* locked wrapper: the per-cpu process list is guarded by the same lock as the runqueue */
+	rq_lock_irqsave(p, flags);
+	__add_to_proclist(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+static inline void __del_from_proclist(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	list_del(&p->proclist_cpu);
+	atomic_dec(&qnr_processes(p->processor));
+	p->proclist_cpu.next = NULL;
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+void del_from_proclist(struct task_struct * p)
+{
+	unsigned long flags;
+	/* locked wrapper: the per-cpu process list is guarded by the same lock as the runqueue */
+	rq_lock_irqsave(p, flags);
+	__del_from_proclist(p);
+	rq_unlock_irqrestore(p, flags);
+}
+
+void runqueue_spin_lock(struct task_struct * p)
+{
+	rq_lock(p);	/* expands to lock_task_rq(p); irqs are not masked here */
+}
+
+void runqueue_spin_unlock(struct task_struct * p)
+{
+	rq_unlock(p);	/* releases the lock of p->processor's runqueue; irq state is untouched */
+}
+
+static inline void __move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, &runqueue_head(p->processor));
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
+static inline void __move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &runqueue_head(p->processor));
+}
+
+/* Request that p run on cpu; returns 1 if p is already there or the request was recorded, 0 if another move is pending. */
+int move_to_cpu(struct task_struct * p, int cpu, int stick)
+{
+	int res = 0;
+	unsigned long flags;
+
+	rq_lock_irqsave(p, flags);
+	if (p == idle_task(p->processor)) BUG();
+	if (p->processor == cpu) {
+		if (stick)	/* build the mask as unsigned long, not by shifting a signed int */
+			p->cpus_allowed = 1UL << cpu;
+		res = 1;
+	} else if (!p->move_to_cpu) {
+		/* the request is stored as cpu + 1, negated when stick is set */
+		p->move_to_cpu = stick ? -cpu - 1 : cpu + 1;
+		res = 1;
+	}
+	rq_unlock_irqrestore(p, flags);
+	return res;
+}
+
+/*
+ * try to find the best cpu to run a fresh new process, no locks are held
+ * during this function. it gets called by do_fork() in SMP mode
+ */
+int get_best_cpu(void)
+{
+	int nr, cpu, best_cpu, this_cpu = smp_processor_id();
+	int min_nr_running, cpu_running, cpu_processes, min_nr_processes;
+
+	best_cpu = this_cpu;
+	min_nr_running = atomic_read(&qnr_running(this_cpu));
+	min_nr_processes = atomic_read(&qnr_processes(this_cpu));
+	for (nr = 0; nr < smp_num_cpus; nr++) {
+		cpu = cpu_logical_map(nr);	/* per-cpu data is indexed by cpu id, not by logical index */
+		if (cpu == this_cpu) continue;
+		cpu_running = atomic_read(&qnr_running(cpu));
+		cpu_processes = atomic_read(&qnr_processes(cpu));
+		/* fewer runnable tasks wins; the per-cpu process count breaks ties */
+		if (cpu_running < min_nr_running ||
+				(cpu_running == min_nr_running && cpu_processes < min_nr_processes)) {
+			min_nr_running = cpu_running;
+			min_nr_processes = cpu_processes;
+			best_cpu = cpu;
+		}
+	}
+	return best_cpu;
+}
+
+/* Flag the first eligible task queued on src_cpu for a move to dst_cpu; returns 1 if one was flagged. */
+static inline int try_steal_task(int src_cpu, int dst_cpu)
+{
+	unsigned long flags;
+	struct list_head *pos;
+	struct task_struct *cand;
+	int stolen = 0;
+
+	spin_lock_irqsave(&runqueue_lock(src_cpu), flags);
+	list_for_each(pos, &runqueue_head(src_cpu)) {
+		cand = list_entry(pos, struct task_struct, run_list);
+		if (cand->move_to_cpu || !can_schedule(cand, dst_cpu))
+			continue;	/* already being moved, or not runnable on dst_cpu */
+		cand->move_to_cpu = dst_cpu + 1;
+		stolen = 1;
+		break;
+	}
+	spin_unlock_irqrestore(&runqueue_lock(src_cpu), flags);
+	return stolen;
+}
+
+/*
+ * very basic balancing function that searches for the most loaded cpu and
+ * tries to steal a process from there; no locks are held during the cpu loop.
+ */
+int runqueue_balance(int mode)
+{
+	int nr, cpu, load, this_cpu = smp_processor_id(), max_nr_running = 0, max_cpu = 0;
+	/* mode (IDLE_RQBALANCE from the idle loop) is not consulted yet; always returns 0 */
+	for (nr = 0; nr < smp_num_cpus; nr++) {
+		if ((cpu = cpu_logical_map(nr)) == this_cpu) continue;	/* per-cpu data is indexed by cpu id */
+		if ((load = atomic_read(&qnr_running(cpu))) > max_nr_running) {
+			max_nr_running = load;	/* sampled once: the value stored is the value compared */
+			max_cpu = cpu;
+		}
+	}
+	if (max_nr_running > (atomic_read(&qnr_running(this_cpu)) + 1))
+		try_steal_task(max_cpu, this_cpu);
+	return 0;
 }
 
 /*
@@ -340,16 +486,16 @@
 	/*
 	 * We want the common case fall through straight, thus the goto.
 	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
+	rq_lock_irqsave(p, flags);
 	p->state = TASK_RUNNING;
-	if (task_on_runqueue(p))
+	if (task_on_runqueue(p) || p->move_to_cpu)
 		goto out;
-	add_to_runqueue(p);
-	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+	__add_to_runqueue(p);
+	if (!synchronous || p->processor != smp_processor_id())
 		reschedule_idle(p);
 	success = 1;
 out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	rq_unlock_irqrestore(p, flags);
 	return success;
 }
 
@@ -382,7 +528,7 @@
  * delivered to the current task. In this case the remaining time
  * in jiffies will be returned, or 0 if the timer expired in time
  *
- * The current task state is guaranteed to be TASK_RUNNING when this 
+ * The current task state is guaranteed to be TASK_RUNNING when this
  * routine returns.
  *
  * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
@@ -475,41 +621,7 @@
 	task_lock(prev);
 	prev->has_cpu = 0;
 	mb();
-	if (prev->state == TASK_RUNNING)
-		goto needs_resched;
-
-out_unlock:
 	task_unlock(prev);	/* Synchronise here with release_task() if prev is TASK_ZOMBIE */
-	return;
-
-	/*
-	 * Slow path - we 'push' the previous process and
-	 * reschedule_idle() will attempt to find a new
-	 * processor for it. (but it might preempt the
-	 * current process as well.) We must take the runqueue
-	 * lock and re-check prev->state to be correct. It might
-	 * still happen that this process has a preemption
-	 * 'in progress' already - but this is not a problem and
-	 * might happen in other circumstances as well.
-	 */
-needs_resched:
-	{
-		unsigned long flags;
-
-		/*
-		 * Avoid taking the runqueue lock in cases where
-		 * no preemption-check is necessery:
-		 */
-		if ((prev == idle_task(smp_processor_id())) ||
-						(policy & SCHED_YIELD))
-			goto out_unlock;
-
-		spin_lock_irqsave(&runqueue_lock, flags);
-		if ((prev->state == TASK_RUNNING) && !prev->has_cpu)
-			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
-		goto out_unlock;
-	}
 #else
 	prev->policy &= ~SCHED_YIELD;
 #endif /* CONFIG_SMP */
@@ -530,15 +642,14 @@
  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
  * information in task[0] is never used.
  */
-asmlinkage void schedule(void)
+static inline void __schedule(void)
 {
 	struct schedule_data * sched_data;
 	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
+	struct list_head *head, *tmp;
 	int this_cpu, c;
 
-
-	spin_lock_prefetch(&runqueue_lock);
+	spin_lock_prefetch(&runqueue_lock(current->processor));
 
 	if (!current->active_mm) BUG();
 need_resched_back:
@@ -556,7 +667,7 @@
 	 */
 	sched_data = & aligned_data[this_cpu].schedule_data;
 
-	spin_lock_irq(&runqueue_lock);
+	spin_lock_irq(&runqueue_lock(this_cpu));
 
 	/* move an exhausted RR process to be last.. */
 	if (prev->policy == SCHED_RR)
@@ -570,10 +681,17 @@
 				break;
 			}
 		default:
-			del_from_runqueue(prev);
+			__del_from_runqueue(prev);
 		case TASK_RUNNING:;
 	}
 	prev->need_resched = 0;
+	/* we certainly do not want to do this onto the idle task */
+	if (prev != idle_task(this_cpu)) {
+		/* this saves the cpu time that has not been consumed by the previous preemption */
+		prev->cpu_jtime = prev->cpu_jtime > prev->sched_jtime ? (prev->cpu_jtime - prev->sched_jtime) >> 1: 0;
+		/* recalculate the cpu time */
+		prev->cpu_jtime += (jiffies - prev->sched_jtime) + jiffies;
+	}
 
 	/*
 	 * this is the scheduler proper:
@@ -589,10 +707,11 @@
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
+	head = &runqueue_head(this_cpu);
+	list_for_each(tmp, head) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
+			int weight = goodness(p, prev->active_mm);
 			if (weight > c)
 				c = weight, next = p;
 		}
@@ -601,6 +720,12 @@
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
+
+#ifdef CONFIG_SMP
+	if (next->move_to_cpu)
+		goto cpu_migrate;
+cpu_migrate_back:
+#endif	/* #ifdef CONFIG_SMP */
 	/*
 	 * from this point on nothing can prevent us from
 	 * switching to the next task, save this fact in
@@ -609,9 +734,9 @@
 	sched_data->curr = next;
 #ifdef CONFIG_SMP
  	next->has_cpu = 1;
-	next->processor = this_cpu;
-#endif
-	spin_unlock_irq(&runqueue_lock);
+#endif	/* #ifdef CONFIG_SMP */
+	next->sched_jtime = jiffies;
+	spin_unlock_irq(&runqueue_lock(this_cpu));
 
 	if (prev == next) {
 		/* We won't go through the normal tail, so do this by hand */
@@ -619,24 +744,6 @@
 		goto same_process;
 	}
 
-#ifdef CONFIG_SMP
- 	/*
- 	 * maintain the per-process 'last schedule' value.
- 	 * (this has to be recalculated even if we reschedule to
- 	 * the same process) Currently this is only used on SMP,
-	 * and it's approximate, so we do not have to maintain
-	 * it while holding the runqueue spinlock.
- 	 */
- 	sched_data->last_schedule = get_cycles();
-
-	/*
-	 * We drop the scheduler lock early (it's a global spinlock),
-	 * thus we have to lock the previous process from getting
-	 * rescheduled during switch_to().
-	 */
-
-#endif /* CONFIG_SMP */
-
 	kstat.context_swtch++;
 	/*
 	 * there are 3 processes which are affected by a context switch:
@@ -683,30 +790,71 @@
 
 recalculate:
 	{
-		struct task_struct *p;
-		spin_unlock_irq(&runqueue_lock);
+		spin_unlock_irq(&runqueue_lock(this_cpu));
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		head = &proclist_head(this_cpu);
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, struct task_struct, proclist_cpu);
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
-		spin_lock_irq(&runqueue_lock);
+		spin_lock_irq(&runqueue_lock(this_cpu));
 	}
 	goto repeat_schedule;
 
 still_running:
 	if (!(prev->cpus_allowed & (1UL << this_cpu)))
 		goto still_running_back;
-	c = goodness(prev, this_cpu, prev->active_mm);
+	c = goodness(prev, prev->active_mm);
 	next = prev;
 	goto still_running_back;
 
 move_rr_last:
 	if (!prev->counter) {
 		prev->counter = NICE_TO_TICKS(prev->nice);
-		move_last_runqueue(prev);
+		__move_last_runqueue(prev);
 	}
 	goto move_rr_back;
 
+#ifdef CONFIG_SMP
+cpu_migrate:	/* reached when the chosen task has a non-zero move_to_cpu request */
+	{
+		int move_cpu, next_cpu, stick;
+
+		if (next == prev) {	/* the requester is the task running here: run idle instead and retry */
+			next = idle_task(this_cpu);
+			next->need_resched = 1;
+			goto cpu_migrate_back;
+		}
+		if (next->move_to_cpu > 0)	/* positive request: target cpu + 1, affinity left alone */
+			move_cpu = next->move_to_cpu - 1, stick = 0;
+		else	/* negative request: -(target cpu + 1), and pin the task to that cpu */
+			move_cpu = -next->move_to_cpu - 1, stick = 1;
+		__del_from_runqueue(next);
+		spin_unlock_irq(&runqueue_lock(this_cpu));
+
+		write_lock_irq(&tasklist_lock);
+		lock_task_rq(next);	/* presumably takes runqueue_lock(next->processor); released below via next_cpu */
+		__del_from_proclist(next);
+		next_cpu = next->processor;
+		next->processor = move_cpu;
+		if (stick)
+			next->cpus_allowed = (1UL << move_cpu);	/* unsigned long mask, like the other cpus_allowed users */
+		spin_unlock(&runqueue_lock(next_cpu));
+
+		spin_lock(&runqueue_lock(move_cpu));
+		__add_to_proclist(next);
+		__add_to_runqueue(next);
+		next->move_to_cpu = 0;	/* request satisfied */
+		reschedule_idle(next);
+		spin_unlock(&runqueue_lock(move_cpu));
+		write_unlock_irq(&tasklist_lock);
+
+		spin_lock_irq(&runqueue_lock(this_cpu));	/* re-take our own lock before picking again */
+	}
+	goto repeat_schedule;
+#endif	/* #ifdef CONFIG_SMP */
+
 scheduling_in_interrupt:
 	printk("Scheduling in interrupt\n");
 	BUG();
@@ -730,7 +878,7 @@
 
 	CHECK_MAGIC_WQHEAD(q);
 	WQ_CHECK_LIST_HEAD(&q->task_list);
-	
+
 	list_for_each(tmp,&q->task_list) {
 		unsigned int state;
                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
@@ -849,7 +997,7 @@
 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
-	
+
 	current->state = TASK_UNINTERRUPTIBLE;
 
 	SLEEP_ON_HEAD
@@ -907,7 +1055,7 @@
 	return tsk;
 }
 
-static int setscheduler(pid_t pid, int policy, 
+static int setscheduler(pid_t pid, int policy,
 			struct sched_param *param)
 {
 	struct sched_param lp;
@@ -926,14 +1074,14 @@
 	 * We play safe to avoid deadlocks.
 	 */
 	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
 
 	p = find_process_by_pid(pid);
 
 	retval = -ESRCH;
 	if (!p)
-		goto out_unlock;
-			
+		goto out_unlock_tkll;
+
+	rq_lock(p);
 	if (policy < 0)
 		policy = p->policy;
 	else {
@@ -942,7 +1090,7 @@
 				policy != SCHED_OTHER)
 			goto out_unlock;
 	}
-	
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 	 * priority for SCHED_OTHER is 0.
@@ -954,7 +1102,7 @@
 		goto out_unlock;
 
 	retval = -EPERM;
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) && 
+	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
@@ -965,19 +1113,20 @@
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
 	if (task_on_runqueue(p))
-		move_first_runqueue(p);
+		__move_first_runqueue(p);
 
 	current->need_resched = 1;
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
+	rq_unlock(p);
+out_unlock_tkll:
 	read_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
 }
 
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 				      struct sched_param *param)
 {
 	return setscheduler(pid, policy, param);
@@ -1042,29 +1191,13 @@
 asmlinkage long sys_sched_yield(void)
 {
 	/*
-	 * Trick. sched_yield() first counts the number of truly 
+	 * Trick. sched_yield() first counts the number of truly
 	 * 'pending' runnable processes, then returns if it's
 	 * only the current processes. (This test does not have
 	 * to be atomic.) In threaded applications this optimization
 	 * gets triggered quite often.
 	 */
-
-	int nr_pending = nr_running;
-
-#if CONFIG_SMP
-	int i;
-
-	// Subtract non-idle processes running on other CPUs.
-	for (i = 0; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
-	}
-#else
-	// on UP this process is on the runqueue as well
-	nr_pending--;
-#endif
-	if (nr_pending) {
+	if (atomic_read(&qnr_running(current->processor)) > 1) {
 		/*
 		 * This process can only be rescheduled by us,
 		 * so this is safe without any locking.
@@ -1259,7 +1392,7 @@
 
 	/* We also take the runqueue_lock while altering task fields
 	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+	rq_lock(this_task);
 
 	this_task->ptrace = 0;
 	this_task->nice = DEF_NICE;
@@ -1274,7 +1407,7 @@
 	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
 	this_task->user = INIT_USER;
 
-	spin_unlock(&runqueue_lock);
+	rq_unlock(this_task);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -1320,10 +1453,11 @@
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
-		del_from_runqueue(current);
+		__del_from_runqueue(current);
 	}
+	current->cpu_jtime = 0;
+	current->sched_jtime = jiffies;
 	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
 	clear_bit(current->processor, &wait_init_idle);
 }
 
@@ -1335,8 +1469,16 @@
 	 * We have to do a little magic to get the first
 	 * process right in SMP mode.
 	 */
-	int cpu = smp_processor_id();
-	int nr;
+	int nr, cpu = smp_processor_id();
+
+	for (nr = 0; nr < NR_CPUS; nr++) {
+		atomic_set(&qnr_processes(nr), 0);
+		atomic_set(&qnr_running(nr), 0);
+		cpu_curr(nr) = &init_task;
+		INIT_LIST_HEAD(&runqueue_head(nr));
+		INIT_LIST_HEAD(&proclist_head(nr));
+		runqueue_lock(nr) = SPIN_LOCK_UNLOCKED;
+	}
 
 	init_task.processor = cpu;
 
@@ -1355,3 +1497,143 @@
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, cpu);
 }
+
+/* latsched_init(): set up, for every CPU, a zeroed ring of STD_LATSCHED_SAMPLES struct latsched_sample entries. */
+void __init latsched_init(void)
+{
+	int cpu, nsamples = STD_LATSCHED_SAMPLES;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		/* on allocation failure the pointer is simply left NULL */
+		if ((latsched_data(cpu) = kmalloc(nsamples * sizeof(struct latsched_sample), GFP_KERNEL)) != NULL)
+			memset(latsched_data(cpu), 0, nsamples * sizeof(struct latsched_sample));
+		latsched_size(cpu) = nsamples;
+		latsched_curr(cpu) = 0;
+	}
+}
+
+/* schedule(): wraps __schedule() and, while lss_enabled is set, timestamps each pass in the per-CPU sample ring. */
+asmlinkage void schedule(void)
+{
+	int this_cpu;
+	unsigned long flags;
+	cycles_t cycls;
+
+	if (atomic_read(&lss_enabled)) {
+		local_irq_save(flags);
+		this_cpu = current->processor;
+		latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid = -1;	/* -1 marks the slot as open */
+		latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_in = get_cycles();	/* entry timestamp */
+		local_irq_restore(flags);
+	}
+
+	__schedule();
+
+	cycls = get_cycles();	/* exit timestamp, read before the enable flag is re-checked */
+	if (atomic_read(&lss_enabled)) {
+		local_irq_save(flags);
+		this_cpu = current->processor;	/* looked up again after __schedule() returns */
+		if (latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid == -1) {	/* only close a slot marked open */
+			latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_out = cycls;
+			latsched_samp(this_cpu, latsched_curr(this_cpu)).lss_pid = current->pid;
+			if (++latsched_curr(this_cpu) >= latsched_size(this_cpu))
+				latsched_curr(this_cpu) = 0;	/* wrap around: the buffer is used as a ring */
+		}
+		local_irq_restore(flags);
+	}
+}
+
+/* latsched_start(): on != 0 arms sampling on zeroed per-CPU rings, on == 0 disarms it; returns 0 or -ENOMEM. */
+int latsched_start(int on)
+{
+	int ii, res = 0;
+
+	cli();
+	if (!on) {
+		atomic_set(&lss_enabled, 0);
+		goto out;
+	}
+	if (atomic_read(&lss_enabled))
+		goto out;	/* already sampling: nothing to do */
+	for (ii = 0; ii < smp_num_cpus; ii++) {
+		/* cli() is in effect, so the allocation must not sleep: GFP_ATOMIC, not GFP_KERNEL */
+		if (!latsched_data(ii) &&
+				!(latsched_data(ii) = kmalloc(latsched_size(ii) * sizeof(struct latsched_sample), GFP_ATOMIC))) {
+			res = -ENOMEM;
+			goto out;
+		}
+		memset(latsched_data(ii), 0, latsched_size(ii) * sizeof(struct latsched_sample));
+		latsched_curr(ii) = 0;
+	}
+	atomic_set(&lss_enabled, 1);
+out:
+	sti();
+	return res;
+}
+
+/* latsched_setsamples(): resize every CPU's ring to nsamps entries; returns 0, -EINVAL, -EBUSY or -ENOMEM. */
+int latsched_setsamples(int nsamps)
+{
+	int ii, res = -EBUSY;
+
+	if (nsamps <= 0 || (unsigned long) nsamps > ~0UL / sizeof(struct latsched_sample))
+		return -EINVAL;	/* byte count would be zero, negative or wrap around */
+	cli();	/* nothing below may sleep, hence GFP_ATOMIC for the allocation */
+	if (atomic_read(&lss_enabled))
+		goto out;	/* rings are not replaced while sampling is enabled */
+	for (ii = 0, res = -ENOMEM; ii < smp_num_cpus; ii++) {
+		if (latsched_data(ii))
+			kfree(latsched_data(ii));
+		if (!(latsched_data(ii) = kmalloc(nsamps * sizeof(struct latsched_sample), GFP_ATOMIC)))
+			goto out;
+		memset(latsched_data(ii), 0, nsamps * sizeof(struct latsched_sample));
+		latsched_size(ii) = nsamps;
+		latsched_curr(ii) = 0;
+	}
+	res = 0;
+out:
+	sti();
+	return res;
+}
+
+/* latsched_getdata(): copy one CPU's samples, oldest first, to the user buffer lsgd->data (at most lsgd->size). */
+int latsched_getdata(struct lsctl_getdata *lsgd)
+{
+	int cpu = lsgd->cpu, res, total, first, start = 0;
+	struct latsched_sample *data = lsgd->data;
+
+	cli();
+	res = -EBUSY;
+	if (atomic_read(&lss_enabled))
+		goto out;
+	res = -EINVAL;
+	if (cpu < 0 || cpu >= smp_num_cpus || lsgd->size < 0)
+		goto out;
+	/* with no ring allocated for this CPU there is nothing to copy: report zero samples */
+	lsgd->rsize = 0;
+	res = 0;
+	if (!latsched_data(cpu) || latsched_size(cpu) <= 0)
+		goto out;
+
+	total = latsched_curr(cpu);
+	if (latsched_samp(cpu, latsched_size(cpu) - 1).lss_pid != 0) {
+		/* a non-zero pid in the last slot is taken to mean the ring wrapped: oldest entry is at the cursor */
+		start = latsched_curr(cpu);
+		total = latsched_size(cpu);
+	}
+	if (total > lsgd->size)
+		total = lsgd->size;	/* never write past the caller's buffer */
+	first = latsched_size(cpu) - start;
+	if (first > total)
+		first = total;
+	lsgd->rsize = total;
+	/* checked copies: tail of the ring first, then the part that wrapped to the front. */
+	/* NOTE(review): copy_to_user() may fault and sleep while cli() is in effect - confirm/stage via a kernel buffer. */
+	if (copy_to_user(data, &latsched_samp(cpu, start), first * sizeof(struct latsched_sample)) ||
+	    copy_to_user(data + first, &latsched_samp(cpu, 0), (total - first) * sizeof(struct latsched_sample)))
+		res = -EFAULT;
+out:
+	sti();
+	return res;
+}
+
diff -Nru linux-2.4.13.vanilla/kernel/signal.c linux-2.4.13.latxs/kernel/signal.c
--- linux-2.4.13.vanilla/kernel/signal.c	Mon Sep 17 16:40:01 2001
+++ linux-2.4.13.latxs/kernel/signal.c	Thu Oct 25 11:34:51 2001
@@ -478,10 +478,10 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
+	runqueue_spin_lock(t);
 	if (t->has_cpu && t->processor != smp_processor_id())
 		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
+	runqueue_spin_unlock(t);
 #endif /* CONFIG_SMP */
 
 	if (t->state & TASK_INTERRUPTIBLE) {
diff -Nru linux-2.4.13.vanilla/kernel/softirq.c linux-2.4.13.latxs/kernel/softirq.c
--- linux-2.4.13.vanilla/kernel/softirq.c	Sat Sep  8 12:02:32 2001
+++ linux-2.4.13.latxs/kernel/softirq.c	Thu Oct 25 11:34:51 2001
@@ -369,7 +369,7 @@
 	sigfillset(&current->blocked);
 
 	/* Migrate to the right CPU */
-	current->cpus_allowed = 1UL << cpu;
+	if (!move_to_cpu(current, cpu, 1)) BUG();
 	while (smp_processor_id() != cpu)
 		schedule();
 

