A colleague's Linux machine was exhibiting a very high load value for no obvious reason. I wanted him to point the kernel debugger at the routine that calculates the load. It had been more than seven years since I last worked on the Linux kernel, so I had to find my way around from first principles. This is an annotated and slightly edited version of what I did.
[dds]$ cd /usr/src
[src]$ ls
debug  linux-2.4  linux-2.4.20-8  redhat
[src]$ cd linux-2.4
[linux-2.4]$ ls
arch             Documentation  kernel       README
configs          drivers        lib          REPORTING-BUGS
COPYING          fs             MAINTAINERS  Rules.make
COPYING.modules  include        Makefile     scripts
CREDITS          init           mm           tmp_include_depends
crypto           ipc            net
[linux-2.4]$ cd kernel/
[kernel]$ ls
acct.c         exit.c      kksymoops.c  panic.c    resource.c  time.c
capability.c   fork.c      kmod.c       pid.c      sched.c     timer.c
context.c      futex.c     ksyms.c      pm.c       signal.c    uid16.c
cpufreq.c      info.c      lowlat.c     printk.c   softirq.c   user.c
dma.c          itimer.c    Makefile     profile.c  sys.c
exec_domain.c  kallsyms.c  module.c     ptrace.c   sysctl.c
[kernel]$ grep 'load =' *
sched.c:	max_load = 1;
sched.c:	load = rq_src->nr_running;
sched.c:	load = this_rq->prev_nr_running[i];
sched.c:	max_load = load;
[kernel]$ vi sched.c
/*
* kernel/sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*
[...]
*/
/*
* find_busiest_queue - find the busiest runqueue.
*/
static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
{
	int nr_running, load, max_load, i;
	runqueue_t *busiest, *rq_src;

	[...]
	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
		nr_running = this_rq->nr_running;
	else
		nr_running = this_rq->prev_nr_running[this_cpu];

	busiest = NULL;
	max_load = 1;
	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_online(i))
			continue;

		rq_src = cpu_rq(i);
		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
			load = rq_src->nr_running;
		else
			load = this_rq->prev_nr_running[i];
		this_rq->prev_nr_running[i] = rq_src->nr_running;
	[...]
This load, however, is per-runqueue bookkeeping used by the SMP balancer, not the load average that top and uptime report. Better to work backwards from the user-visible interface under /proc.
[kernel]$ ls /proc
1     1230   16428  21485  2314  2742   8            hpc          mtrr
10    1231   16430  22     2315  2743   82           ide          net
1003  1232   16432  22136  24    27719  9            interrupts   partitions
1013  1233   18     22160  254   3      954          iomem        pci
1027  1235   18602  22161  255   4      968          ioports      scsi
1038  1236   18603  22162  256   5      994          irq          self
1093  12599  19     22163  257   5023   bus          kcore        slabinfo
1170  12600  2      23     258   5024   cmdline      kmsg         stat
1188  12601  20     2306   2734  6      cpuinfo      ksyms        swaps
1197  12602  21     2307   2735  7      devices      loadavg      sys
1207  12603  21204  2308   2736  706    dma          locks        sysvipc
1209  12604  21206  2309   2737  710    driver       mdstat       tty
1212  12605  21217  2310   2738  7153   execdomains  meminfo      uptime
1214  12606  21449  2311   2739  7154   fb           misc         version
1228  12607  21450  2312   2740  720    filesystems  modules
1229  16     21451  2313   2741  756    fs           mounts
[kernel]$ cat /proc/loadavg
0.02 0.04 0.00 1/102 21486
[kernel]$ grep loadavg *
[kernel]$
Nothing in kernel/ mentions loadavg directly, so the search has to widen to the whole tree.
[kernel]$ cd ..
[linux-2.4]$ find . -name '*.c' | xargs grep loadavg
./drivers/char/tpqic02.c:	/* Wait for ready or exception, without driving the loadavg up too much.
./drivers/net/wan/comx.c:static void comx_loadavg_timerfun(unsigned long d)
[...]
./drivers/net/wan/comx.c:	del_timer(&ch->loadavg_timer);
./fs/proc/proc_misc.c:static int loadavg_read_proc(char *page, char **start, off_t off,
./fs/proc/proc_misc.c:	{"loadavg",	loadavg_read_proc},
[linux-2.4]$ cd fs/proc
[proc]$ vi proc_misc.c
/*
* linux/fs/proc/proc_misc.c
*
* linux/fs/proc/array.c
* Copyright (C) 1992 by Linus Torvalds
* based on ideas by Darren Senn
[...]
 */
static int loadavg_read_proc(char *page, char **start, off_t off,
				 int count, int *eof, void *data)
{
	int a, b, c;
	int len;

	a = avenrun[0] + (FIXED_1/200);
	b = avenrun[1] + (FIXED_1/200);
	c = avenrun[2] + (FIXED_1/200);
	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		nr_running(), nr_threads, last_pid);
	return proc_calc_metrics(page, start, off, count, eof, len);
}
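So the five fields of /proc/loadavg are the three averages, nr_running()/nr_threads, and the last PID assigned. The averages are fixed-point numbers taken from avenrun[]; LOAD_INT and LOAD_FRAC unpack them, and adding FIXED_1/200 rounds to the nearest hundredth before truncation. The stand-alone sketch below mirrors what I believe are the definitions in include/linux/sched.h; verify them against your own tree.

/*
 * Sketch of the /proc/loadavg fixed-point formatting.
 * FSHIFT, FIXED_1, LOAD_INT and LOAD_FRAC mirror include/linux/sched.h
 * in the 2.4 tree; check them against your own sources.
 */
#include <stdio.h>

#define FSHIFT		11			/* bits of fractional precision */
#define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed point */
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	unsigned long a = 3 * FIXED_1 + FIXED_1 / 2;	/* 3.50 in fixed point */

	a += FIXED_1 / 200;		/* round to the nearest hundredth */
	printf("%lu.%02lu\n", LOAD_INT(a), LOAD_FRAC(a));	/* prints 3.50 */
	return 0;
}

The values themselves come from avenrun[], which is defined elsewhere.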
[proc]$ cd ../../kernel
[kernel]$ grep avenrun *.c
info.c:	val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
info.c:	val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
info.c:	val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
timer.c: * imply that avenrun[] is the standard name for this kind of thing.
timer.c:unsigned long avenrun[3];
timer.c:	CALC_LOAD(avenrun[0], EXP_1, active_tasks);
timer.c:	CALC_LOAD(avenrun[1], EXP_5, active_tasks);
timer.c:	CALC_LOAD(avenrun[2], EXP_15, active_tasks);
Two candidates.
[kernel]$ vi info.c timer.c
/*
* linux/kernel/info.c
*
* Copyright (C) 1992 Darren Senn
*/
[...]
asmlinkage long sys_sysinfo(struct sysinfo *info)
{
	struct sysinfo val;

	memset((char *)&val, 0, sizeof(struct sysinfo));

	cli();
	val.uptime = jiffies / HZ;

	val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
	val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
	val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
	[...]
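So info.c merely exports avenrun[] to user space through sysinfo(2), widening the fraction from FSHIFT (11) to SI_LOAD_SHIFT bits. As a quick cross-check against /proc/loadavg, here is a minimal user-space reader; it assumes glibc's sysinfo() wrapper and a SI_LOAD_SHIFT of 16.

/*
 * Read the load averages through sysinfo(2) instead of /proc.
 * The 65536.0 divisor is 1 << SI_LOAD_SHIFT, assuming
 * SI_LOAD_SHIFT == 16.
 */
#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si) != 0) {
		perror("sysinfo");
		return 1;
	}
	printf("%.2f %.2f %.2f\n",
	    si.loads[0] / 65536.0,
	    si.loads[1] / 65536.0,
	    si.loads[2] / 65536.0);
	return 0;
}

The other candidate, timer.c, is where the averages are actually computed.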
/*
* linux/kernel/timer.c
*
* Kernel internal timers, kernel timekeeping, basic process system calls
*
* Copyright (C) 1991, 1992 Linus Torvalds
[...]
*/
/*
* Hmm.. Changed this, as the GNU make sources (load.c) seems to
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
*/
unsigned long avenrun[3];

static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (count < 0) {
		count += LOAD_FREQ;
		active_tasks = count_active_tasks();
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}

static unsigned long count_active_tasks(void)
{
	return (nr_running() + nr_uninterruptible()) * FIXED_1;
}
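This is the source of the numbers: every LOAD_FREQ ticks (five seconds in this tree) calc_load() folds the count of runnable plus uninterruptible tasks into three exponentially-weighted moving averages. CALC_LOAD and the EXP_* constants also live in include/linux/sched.h; the sketch below reproduces what I believe are the 2.4 definitions (each EXP_n is FIXED_1 divided by e^(5 s / n min)) and simulates one minute with two permanently active tasks.

/*
 * Stand-alone simulation of the avenrun[] calculation.  The macro and
 * constants mirror include/linux/sched.h in the 2.4 tree; verify them
 * against your own sources.
 */
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1 << FSHIFT)
#define EXP_1	1884		/* FIXED_1 / exp(5 s / 1 min) */
#define EXP_5	2014		/* FIXED_1 / exp(5 s / 5 min) */
#define EXP_15	2037		/* FIXED_1 / exp(5 s / 15 min) */

#define CALC_LOAD(load, exp, n) \
	load *= (exp); \
	load += (n) * (FIXED_1 - (exp)); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 2 * FIXED_1;	/* two tasks stay active */
	int i;

	for (i = 0; i < 12; i++) {		/* 12 x 5 s = one minute */
		CALC_LOAD(avenrun[0], EXP_1, active);
		CALC_LOAD(avenrun[1], EXP_5, active);
		CALC_LOAD(avenrun[2], EXP_15, active);
	}
	/* The 1-minute figure has climbed to about (1 - 1/e) * 2 = 1.26. */
	printf("%.2f %.2f %.2f\n",
	    avenrun[0] / (double)FIXED_1,
	    avenrun[1] / (double)FIXED_1,
	    avenrun[2] / (double)FIXED_1);
	return 0;
}

Note what count_active_tasks() counts: nr_uninterruptible() as well as nr_running(). Processes stuck in uninterruptible sleep, typically waiting on I/O, drive the load average up even while the CPU sits idle, which is one way a machine ends up with a high load for no obvious reason.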