简要说明

一个linux内核小白的学习路,看书的过程有很多暂时无法理解的我就直接略过或者简单笔记一下。由于我主要的目的是学习android内核,所以我主要参考代码是安卓内核的源码。有些我看书理解的如果有错误,还请多多指点。

参考书籍:Linux内核深度解析《余华兵》

内核引导和初始化

当CPU通电后,首先是执行引导程序,引导程序存放在一个只读的存储器中,物理地址0开始的一段地址空间分配给了这个存储器,引导程序把内核加载到内存中,然后执行内核,内核初始化完成后,启动用户空间第一个进程。

根据书中提示引导程序的入口是arch/arm/cpu/armv8/start.S

而我参考的是android内核,翻来翻去都没找到,所以我跳过了uboot引导程序加载的部分。

直接快进到arch/arm64/kernel/head.S,这里对比发现和android内核的差不多了。

首先看开始的相关代码

1
2
3
4
5
6
7
8
9
10
#ifdef CONFIG_EFI
efi_head:
/*
* This add instruction has no meaningful effect except that
* its opcode forms the magic "MZ" signature required by UEFI.
*/
add x13, x18, #0x16
b stext
#else
b stext // branch to kernel start, magic

CONFIG_EFI表示提供UEFI运行时支持。

上面可以看到跳转到了stext。下面继续看stext的代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
ENTRY(stext)
bl preserve_boot_args // 把引导程序传的4个参数保存在全局数组boot_args中
bl el2_setup // Drop to EL1, w20=cpu_boot_mode
adrp x24, __PHYS_OFFSET
bl set_cpu_boot_mode_flag

bl __vet_fdt
bl __create_page_tables // 创建页表映射 x25=TTBR0, x26=TTBR1
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
ldr x27, =__mmap_switched // address to jump to after
// MMU has been enabled
adr_l lr, __enable_mmu // return (PIC) address
b __cpu_setup // 初始化处理器 initialise processor
ENDPROC(stext)

发现虽然和书中的代码相差不大,但是还是有一定差别,这里并没有看到书中的__primary_switch。

然后我仔细翻了下代码。然后重点看下面几句

1
2
3
4
5
ldr    x27, =__mmap_switched        // 将__mmap_switched的地址保存在x27寄存器中
adr_l lr, __enable_mmu // 这里应该是将__enable_mmu这个函数的符号地址存到lr中
b __cpu_setup //初始化处理器
// 虽然这里是b指令直接跳转,但是他手动设置了lr
// 所以这里__cpu_setup执行完成后,应该接着就执行__enable_mmu

所以接下来看看__enable_mmu函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
__enable_mmu:
ldr x5, =vectors
msr vbar_el1, x5
msr ttbr0_el1, x25 // load TTBR0
msr ttbr1_el1, x26 // load TTBR1
isb
msr sctlr_el1, x0
isb
/*
* Invalidate the local I-cache so that any instructions fetched
* speculatively from the PoC are discarded, since they may have
* been dynamically patched at the PoU.
*/
ic iallu
dsb nsh
isb
br x27
ENDPROC(__enable_mmu)

__enable_mmu书中是说这里是开启内存管理单元,以后执行程序时内存管理单元会将虚拟地址转换成物理地址,由于我比较菜,所以暂时不理会函数内部的如何实现,先关注执行的走向,最后看到br指令跳转到x27的对应函数,前面能看到x27实际上是__mmap_switched函数的地址,所以继续看这个函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
__mmap_switched:
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset
dsb ishst // Make zero page visible to PTW

adr_l sp, initial_sp, x4
mov x4, sp
and x4, x4, #~(THREAD_SIZE - 1)
#ifdef CONFIG_THREAD_INFO_IN_TASK
adr_l x5, init_task
msr sp_el0, x5 // Save thread_info
#else
msr sp_el0, x4 // Save thread_info
#endif
str_l x21, __fdt_pointer, x5 // Save FDT pointer
str_l x24, memstart_addr, x6 // Save PHYS_OFFSET
mov x29, #0
#ifdef CONFIG_KASAN
bl kasan_early_init
#endif
b start_kernel
ENDPROC(__mmap_switched)

走到这里。我们就看到最后调用了内核初始化的c语言部分的入口函数start_kernel

书中没有看到具体是哪个文件下。我自己找了下,路径是./init/main.c中。下面贴上完整代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
asmlinkage __visible void __init start_kernel(void)
{
char *command_line;
char *after_dashes;

/*
* Need to run as early as possible, to initialize the
* lockdep hash:
*/
lockdep_init();
set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
debug_objects_early_init();

cgroup_init_early();

local_irq_disable();
early_boot_irqs_disabled = true;

/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
/*
* Set up the the initial canary ASAP:
*/
boot_init_stack_canary();
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

build_all_zonelists(NULL, NULL);
page_alloc_init();

pr_notice("Kernel command line: %s\n", boot_command_line);
parse_early_param();
after_dashes = parse_args("Booting kernel",
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, &unknown_bootoption);
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
set_init_arg);

jump_label_init();

/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
mm_init();

/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
idr_init_cache();
rcu_init();

/* trace_printk() and trace points may be used after this */
trace_init();

context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
sched_clock_postinit();
perf_event_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable();

kmem_cache_init_late();

/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
if (panic_later)
panic("Too many boot %s vars at `%s'", panic_later,
panic_param);

lockdep_info();

/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();

#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_cgroup_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
sched_clock_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
#ifdef CONFIG_X86_ESPFIX64
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
#endif
thread_stack_cache_init();
cred_init();
fork_init(totalram_pages);
proc_caches_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init(totalram_pages);
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
proc_root_init();
cgroup_init();
cpuset_init();
taskstats_init_early();
delayacct_init();

check_bugs();

acpi_subsystem_init();
sfi_init_late();

if (efi_enabled(EFI_RUNTIME_SERVICES)) {
efi_late_init();
efi_free_boot_services();
}

ftrace_init();

/* Do the rest non-__init'ed, we're now alive */
rest_init();
}

可以看到上面的代码基本都是init,初始化各类的子系统。到最后调用了rest_init函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
static noinline void __init_refok rest_init(void)
{
int pid;

rcu_scheduler_starting();
smpboot_thread_init();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
complete(&kthreadd_done);

/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
init_idle_bootup_task(current);
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
}

重点看书中所说的init线程和创建内核的线程,也就是下面的代码

1
2
kernel_thread(kernel_init, NULL, CLONE_FS);
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);

所以我们接着看kernel_init函数的实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
static int __ref kernel_init(void *unused)
{
int ret;

kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem();
mark_readonly();
system_state = SYSTEM_RUNNING;
numa_default_policy();

flush_delayed_fput();

if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}

/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
execute_command, ret);
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;

panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/init.txt for guidance.");
}

最后我在kernel_init_freeable函数中看到了书中所说的smp系统初始化相关的代码。这里就不继续往后贴了

SMP系统就是对称多处理器系统。并且每个处理器的地位平等。但是在启动过程中不是平等的。0号处理器是引导处理器,负责执行引导程序和初始化内存,其他处理器是从处理器,等待引导处理器完成初始化,引导处理器初始化完内核后启动从处理器。启动流程看起来比较复杂。暂时略过。

init进程是用户空间第一个进程,负责启动用户程序,我们可以看到上面start_kernel函数中是使用try_to_run_init_process函数来启动init进程的。

1
2
3
4
5
6
//尝试启动/sbin/init失败则启动/etc/init以此类推
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;

继续看看这个函数怎么实现的

1
2
3
4
5
6
7
8
9
10
11
12
13
static int try_to_run_init_process(const char *init_filename)
{
int ret;

ret = run_init_process(init_filename);

if (ret && ret != -ENOENT) {
pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
init_filename, ret);
}

return ret;
}

然后就看到调用了run_iniit_process函数,继续看实现

1
2
3
4
5
6
7
static int run_init_process(const char *init_filename)
{
argv_init[0] = init_filename;
return do_execve(getname_kernel(init_filename),
(const char __user *const __user *)argv_init,
(const char __user *const __user *)envp_init);
}

到这里就知道了,实际就是通过execve()来运行init程序的了。

到这里第一章就结束了,小白啃的太费劲了。然后下面简单画了下流程图。

kernel1