Understanding Kdump (Loading Part)
Table of Contents
1. Introduction
2. Routine of User Mode
3. Routine of Kernel Mode
1. Introduction
kdump, which is built on top of kexec, is used to debug the Linux kernel. Both kdump and kexec are currently integrated into the kexec-tools program. After compiling the program, run "build/sbin/kexec" with the "-l" option to invoke the kexec function; the "-p" option is for kdump. The main differences between kexec and kdump are listed below.
1. kexec's second kernel overwrites the first kernel, while kdump's capture kernel is loaded into memory reserved away from the first kernel.
2. To use kdump, the first kernel must be booted with "crashkernel=x@y" on its command line. After the second (capture) kernel comes up, you will find that kexec-tools has appended two parameters to its command line: "mem=A" and "elfcorehdr=B". "mem" limits the second kernel's available memory, and "elfcorehdr" tells the capture kernel where the debug information (the ELF core header) can be found.
Now, let's walk through the kdump load routine.
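For example (all values hypothetical), the flow looks like this:

  first kernel cmdline    : console=ttyS0 root=/dev/sda1 crashkernel=64M@98M
  load the capture kernel : kexec -p zImage --append="console=ttyS0 root=/dev/sda1"
  capture kernel cmdline  : console=ttyS0 root=/dev/sda1 mem=64512K elfcorehdr=0xa100000

Here 64M@98M reserves 64MB starting at the 98MB physical offset; where the "mem=" and "elfcorehdr=" values come from is explained in the walkthrough below.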
2. Routine of User Mode
1061 int main(int argc, char *argv[])
1062 {
1063 int do_load = 1;
1064 int do_exec = 0;
1065 int do_load_jump_back_helper = 0;
1066 int do_shutdown = 1;
1067 int do_sync = 1;
1068 int do_ifdown = 0;
1069 int do_unload = 0;
1070 int do_reuse_initrd = 0;
1071 void *entry = 0;
1072 char *type = 0;
1073 char *endptr;
1074 int opt;
1075 int result = 0;
1076 int fileind;
1077 static const struct option options[] = {
1078 KEXEC_ALL_OPTIONS
1079 { 0, 0, 0, 0},
1080 };
1081 static const char short_options[] = KEXEC_ALL_OPT_STR;
1082
1083 while ((opt = getopt_long(argc, argv, short_options,
1084 options, 0)) != -1) {
1085 switch(opt) {
1086 case '?':
1087 case OPT_HELP:
1088 usage();
1089 return 0;
1090 case OPT_VERSION:
…
1152 case OPT_PANIC:
1153 do_load = 1;
1154 do_exec = 0;
1155 do_shutdown = 0;
1156 do_sync = 0;
1157 kexec_flags = KEXEC_ON_CRASH;
1158 break;
…
1187 if ((kexec_flags & KEXEC_ON_CRASH) && !is_crashkernel_mem_reserved()) {
1188 printf("Memory for crashkernel is not reserved\n");
1189 printf("Please reserve memory by passing ");
1190 printf("\"crashkernel=X@Y\" parameter to the kernel\n");
1191 die("Then try loading kdump kernel\n");
1192 }
1193
…
1201 fileind = optind;
1202 /* Reset getopt for the next pass; called in other source modules */
1203 opterr = 1;
1204 optind = 1;
1205
1206 result = arch_process_options(argc, argv);
…
1227 if (do_load && (result == 0)) {
1228 result = my_load(type, fileind, argc, argv, kexec_flags, entry);
1229 }
…
main->my_load:
648 /*
649 * Load the new kernel
650 */
651 static int my_load(const char *type, int fileind, int argc, char **argv,
652 unsigned long kexec_flags, void *entry)
653 {
654 char *kernel;
655 char *kernel_buf;
656 off_t kernel_size;
657 int i = 0;
658 int result;
659 struct kexec_info info;
660 long native_arch;
661 int guess_only = 0;
662
663 memset(&info, 0, sizeof(info));
664 info.segment = NULL;
665 info.nr_segments = 0;
666 info.entry = NULL;
667 info.backup_start = 0;
668 info.kexec_flags = kexec_flags;
669
670 result = 0;
671 if (argc - fileind <= 0) {
672 fprintf(stderr, "No kernel specified\n");
673 usage();
674 return -1;
675 }
676 kernel = argv[fileind];
677 /* slurp in the input kernel */
678 kernel_buf = slurp_decompress_file(kernel, &kernel_size);
679
680 dbgprintf("kernel: %p kernel_size: %lx\n",
681 kernel_buf, kernel_size);
682
683 if (get_memory_ranges(&info.memory_range, &info.memory_ranges,
684 info.kexec_flags) < 0 || info.memory_ranges == 0) {
685 fprintf(stderr, "Could not get memory layout\n");
686 return -1;
687 }
688 /* if a kernel type was specified, try to honor it */
689 if (type) {
690 for (i = 0; i < file_types; i++) {
691 if (strcmp(type, file_type[i].name) == 0)
692 break;
693 }
694 if (i == file_types) {
695 fprintf(stderr, "Unsupported kernel type %s\n", type);
696 return -1;
697 } else {
698 /* make sure our file is really of that type */
699 if (file_type[i].probe(kernel_buf, kernel_size) < 0)
700 guess_only = 1;
701 }
702 }
703 if (!type || guess_only) {
704 for (i = 0; i < file_types; i++) {
705 if (file_type[i].probe(kernel_buf, kernel_size) >= 0)
706 break;
707 }
708 if (i == file_types) {
709 fprintf(stderr, "Cannot determine the file type "
710 "of %s\n", kernel);
711 return -1;
712 } else {
713 if (guess_only) {
714 fprintf(stderr, "Wrong file type %s, "
715 "file matches type %s\n",
716 type, file_type[i].name);
717 return -1;
718 }
719 }
720 }
721 /* Figure out our native architecture before load */
722 native_arch = physical_arch();
723 if (native_arch < 0) {
724 return -1;
725 }
726 info.kexec_flags |= native_arch;//Now there are two flags, KEXEC_ON_CRASH|KEXEC_ARCH_ARM
727
728 result = file_type[i].load(argc, argv, kernel_buf, kernel_size, &info);//0: success
main->my_load->zImage_arm_load:
218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219 struct kexec_info *info)
220 {
221 unsigned long base;
222 unsigned int atag_offset = 0x1000; /* 4k offset from memory start */
223 unsigned int offset = 0x8000; /* 32k offset from memory start */
224 const char *command_line;
225 char *modified_cmdline = NULL;
226 off_t command_line_len;
227 const char *ramdisk;
228 char *ramdisk_buf;
229 int opt;
230 int use_atags;
231 char *dtb_buf;
232 off_t dtb_length;
233 char *dtb_file;
234 off_t dtb_offset;
235 dbgprintf("buf:%p, len:%lx\n",buf,len);
236 /* See options.h -- add any more there, too. */
237 static const struct option options[] = {
238 KEXEC_ARCH_OPTIONS
239 { "command-line", 1, 0, OPT_APPEND },
240 { "append", 1, 0, OPT_APPEND },
241 { "initrd", 1, 0, OPT_RAMDISK },
242 { "ramdisk", 1, 0, OPT_RAMDISK },
243 { "dtb", 1, 0, OPT_DTB },
244 { "atags", 0, 0, OPT_ATAGS },
245 { 0, 0, 0, 0 },
246 };
247 static const char short_options[] = KEXEC_ARCH_OPT_STR "a:r:";
248
249 /*
250 * Parse the command line arguments
251 */
252 command_line = 0;
253 command_line_len = 0;
254 ramdisk = 0;
255 ramdisk_buf = 0;
256 initrd_size = 0;
257 use_atags = 0;
258 dtb_file = NULL;
259 while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
260 switch(opt) {
261 default:
262 /* Ignore core options */
263 if (opt < OPT_ARCH_MAX) {
264 break;
265 }
266 case '?':
267 usage();
268 return -1;
269 case OPT_APPEND:
270 command_line = optarg;
271 break;
272 case OPT_RAMDISK:
273 ramdisk = optarg;
274 break;
275 case OPT_DTB:
276 dtb_file = optarg;
277 break;
278 case OPT_ATAGS:
279 use_atags = 1;
280 break;
281 }
282 }
283
284 if (use_atags && dtb_file) {
285 fprintf(stderr, "You can only use ATAGs if you don't specify a "
286 "dtb file.\n");
287 return -1;
288 }
289
290 if (command_line) {
291 command_line_len = strlen(command_line) + 1;
292 if (command_line_len > COMMAND_LINE_SIZE)
293 command_line_len = COMMAND_LINE_SIZE;
294 }
295 if (ramdisk) {//Read init ramdisk to memory.
296 ramdisk_buf = slurp_file(ramdisk, &initrd_size);
297 }
298
299 /*
300 * If we are loading a dump capture kernel, we need to update kernel
301 * command line and also add some additional segments.
302 */
303 if (info->kexec_flags & KEXEC_ON_CRASH) {
304 uint64_t start, end;
305
306 modified_cmdline = xmalloc(COMMAND_LINE_SIZE);
307 if (!modified_cmdline)
308 return -1;
309
310 if (command_line) {
311 (void) strncpy(modified_cmdline, command_line,
312 COMMAND_LINE_SIZE);
313 modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
314 }
315
316 if (load_crashdump_segments(info, modified_cmdline) < 0) {
317 free(modified_cmdline);
318 return -1;
319 }
320
321 command_line = modified_cmdline;
322 command_line_len = strlen(command_line) + 1;
323
324 /*
325 * We put the dump capture kernel at the start of crashkernel
326 * reserved memory.
327 */
328 if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329 /*
330 * No crash kernel memory reserved. We cannot do more
331 * but just bail out.
332 */
333 return -1;
334 }
335 base = start;
336 } else {
337 dbgprintf("len:%lx,offset:%ux,len+offset:%lx\n",len,offset,len+offset);
338 base = locate_hole(info,len+offset,0,0,ULONG_MAX,INT_MAX);
339 }
Lines 303~314 and 321~322 handle the command line. Line 335 deserves special attention: for kdump, the kernel is placed at the start of the crashkernel reserved memory. Now let's look at line 316.
main->my_load->zImage_arm_load-> load_crashdump_segments
255 /**
256 * load_crashdump_segments() - loads additional segments needed for kdump
257 * @info: kexec info structure
258 * @mod_cmdline: kernel command line
259 *
260 * This function loads additional segments which are needed for the dump capture
261 * kernel. It also updates kernel command line passed in @mod_cmdline to have
262 * right parameters for the dump capture kernel.
263 *
264 * Return %0 in case of success and %-1 in case of error.
265 */
266 int load_crashdump_segments(struct kexec_info *info, char *mod_cmdline)
267 {
268 unsigned long elfcorehdr;
269 unsigned long bufsz;
270 void *buf;
271 int err;
272
273 /*
274 * First fetch all the memory (RAM) ranges that we are going to pass to
275 * the crashdump kernel during panic.
276 */
277 err = crash_get_memory_ranges();
main->my_load->zImage_arm_load-> load_crashdump_segments->crash_get_memory_ranges
155 static int crash_get_memory_ranges(void)
156 {
157 /*
158 * First read all memory regions that can be considered as
159 * system memory including the crash area.
160 */
161 kexec_iomem_for_each_line(NULL, crash_range_callback, NULL);
…
167
168 /*
169 * Exclude memory reserved for crashkernel (this may result a split memory
170 * region).
171 */
172 crash_exclude_range();
173
174 /*
175 * Make sure that the memory regions are sorted.
176 */
177 qsort(usablemem_rgns.ranges, usablemem_rgns.size,
178 sizeof(*usablemem_rgns.ranges), range_cmp);
179
180 return 0;
181 }
At line 161, kexec_iomem_for_each_line() will:
1. Look up the "System RAM" regions in /proc/iomem and store them in usablemem_rgns.ranges.
2. Look up the "Crash kernel" region in /proc/iomem and store it in crash_reserved_mem.
Then, at line 172, crash_exclude_range() removes the crash_reserved_mem range from the "System RAM" range in usablemem_rgns.ranges, so usablemem_rgns is split into two ranges.
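To illustrate (assuming crashkernel=64M@98M, the configuration used later in this article), /proc/iomem might contain:

  00000000-1fffffff : System RAM
    06200000-0a1fffff : Crash kernel

After crash_exclude_range(), usablemem_rgns holds the two ranges 00000000-061fffff and 0a200000-1fffffff.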
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
281 /*
282 * Now that we have memory regions sorted, we can use first memory
283 * region as PHYS_OFFSET.
284 */
285 phys_offset = usablemem_rgns.ranges->start;
286 dbgprintf("phys_offset: %#lx\n", phys_offset);
287
288 err = crash_create_elf32_headers(info, &elf_info,
289 usablemem_rgns.ranges,
290 usablemem_rgns.size, &buf, &bufsz,
291 ELF_CORE_HEADER_ALIGN);
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)
elf_info is defined below.
48 static struct crash_elf_info elf_info = {
49 .class = ELFCLASS32,
50 .data = ELFDATA2LSB,
51 .machine = EM_ARM,
52 .page_offset = PAGE_OFFSET,
53 };
29 int FUNC(struct kexec_info *info,
30 struct crash_elf_info *elf_info,
31 struct memory_range *range, int ranges,
32 void **buf, unsigned long *size, unsigned long align) //buf and size are output parameters.
33 {
34 EHDR *elf;
35 PHDR *phdr;
36 int i;
37 unsigned long sz;
38 char *bufp;
39 long int nr_cpus = 0;
40 uint64_t notes_addr, notes_len;
41 uint64_t vmcoreinfo_addr, vmcoreinfo_len;
42 int has_vmcoreinfo = 0;
43 int (*get_note_info)(int cpu, uint64_t *addr, uint64_t *len);
44
45 if (xen_present())
46 nr_cpus = xen_get_nr_phys_cpus();
47 else
48 nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
49
50 if (nr_cpus < 0) {
51 return -1;
52 }
53
54 if (xen_present()) {
55 if (!get_xen_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
56 has_vmcoreinfo = 1;
57 } else
58 if (!get_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
59 has_vmcoreinfo = 1;
get_kernel_vmcoreinfo() reads the vmcoreinfo note's physical address and length from /sys/kernel/vmcoreinfo. Let's look at the details below.
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)-> get_kernel_vmcoreinfo
139 /* Returns the physical address of start of crash notes buffer for a kernel. */
140 int get_kernel_vmcoreinfo(uint64_t *addr, uint64_t *len)
141 {
142 return get_vmcoreinfo("/sys/kernel/vmcoreinfo", addr, len);
143 }
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)-> get_kernel_vmcoreinfo-> get_vmcoreinfo
113 static int get_vmcoreinfo(const char *kdump_info, uint64_t *addr, uint64_t *len)
114 {
115 char line[MAX_LINE];
116 int count;
117 FILE *fp;
118 unsigned long long temp, temp2;
119
120 *addr = 0;
121 *len = 0;
122
123 if (!(fp = fopen(kdump_info, "r")))
124 return -1;
125
126 if (!fgets(line, sizeof(line), fp))
127 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
128 count = sscanf(line, "%Lx %Lx", &temp, &temp2);
129 if (count != 2)
130 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
131
132 *addr = (uint64_t) temp;
133 *len = (uint64_t) temp2;
134
135 fclose(fp);
136 return 0;
137 }
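For example (hypothetical values), /sys/kernel/vmcoreinfo holds a single line with two hexadecimal numbers, the note's physical address and its length:

  33f252c 1000

sscanf() parses them into temp and temp2, which are then returned through *addr and *len.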
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC) (cont'd)
61 sz = sizeof(EHDR) + (nr_cpus + has_vmcoreinfo) * sizeof(PHDR) +
62 ranges * sizeof(PHDR);
EHDR is `Elf32_Ehdr` (the ELF header) and PHDR is `Elf32_Phdr` (a program header). Assuming a single CPU, we have one crash-notes note, the vmcoreinfo note, and two memory ranges, so sz = sizeof(EHDR) + 2*sizeof(PHDR) + 2*sizeof(PHDR).
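A quick sanity check of that arithmetic, as a standalone sketch of my own (not part of kexec-tools), assuming one CPU and the two split RAM ranges:

#include <elf.h>
#include <stdio.h>

int main(void)
{
	long nr_cpus = 1, has_vmcoreinfo = 1, ranges = 2;
	unsigned long sz = sizeof(Elf32_Ehdr)
		+ (nr_cpus + has_vmcoreinfo) * sizeof(Elf32_Phdr)
		+ ranges * sizeof(Elf32_Phdr);

	/* 52 + 2*32 + 2*32 = 180 bytes, before _ALIGN(sz, align) */
	printf("sz = %lu\n", sz);
	return 0;
}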
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC) (cont'd)
63
64 /*
65 * Certain architectures such as x86_64 and ia64 require a separate
66 * PT_LOAD program header for the kernel. This is controlled through
67 * elf_info->kern_size.
68 *
69 * The separate PT_LOAD program header is required either because the
70 * kernel is mapped at a different location than the rest of the
71 * physical memory or because we need to support relocatable kernels.
72 * Or both as on x86_64.
73 *
74 * In the relocatable kernel case this PT_LOAD segment is used to tell
75 * where the kernel was actually loaded which may be different from
76 * the load address present in the vmlinux file.
77 *
78 * The extra kernel PT_LOAD program header results in a vmcore file
79 * which is larger than the size of the physical memory. This is
80 * because the memory for the kernel is present both in the kernel
81 * PT_LOAD program header and in the physical RAM program headers.
82 */
83
84 if (elf_info->kern_size && !xen_present()) {
85 sz += sizeof(PHDR);
86 }
elf_info->kern_size is zero here, so no extra PT_LOAD header is added. Next, the ELF header is filled in.
99 sz = _ALIGN(sz, align);
100
101 bufp = xmalloc(sz);
102 memset(bufp, 0, sz);
103
104 *buf = bufp;
105 *size = sz;
106
107 /* Setup ELF Header*/
108 elf = (EHDR *) bufp;
109 bufp += sizeof(EHDR);
110 memcpy(elf->e_ident, ELFMAG, SELFMAG);
111 elf->e_ident[EI_CLASS] = elf_info->class;
112 elf->e_ident[EI_DATA] = elf_info->data;
113 elf->e_ident[EI_VERSION]= EV_CURRENT;
114 elf->e_ident[EI_OSABI] = ELFOSABI_NONE;
115 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
116 elf->e_type = ET_CORE;
117 elf->e_machine = crash_architecture(elf_info);
118 elf->e_version = EV_CURRENT;
119 elf->e_entry = 0;
120 elf->e_phoff = sizeof(EHDR);
121 elf->e_shoff = 0;
122 elf->e_flags = 0;
123 elf->e_ehsize = sizeof(EHDR);
124 elf->e_phentsize= sizeof(PHDR);
125 elf->e_phnum = 0;
126 elf->e_shentsize= 0;
127 elf->e_shnum = 0;
128 elf->e_shstrndx = 0;
Then the crash notes information is obtained via get_crash_notes_per_cpu(). This function reads "/sys/devices/system/cpu/cpuN/crash_notes" to get the physical address of each CPU's crash notes buffer; the length is fixed at 1024 bytes.
Once the per-cpu crash notes are obtained, they are filled into the corresponding program headers:
141 for (i = 0; i < nr_cpus; i++) {
142 if (get_note_info(i, &notes_addr, &notes_len) < 0) {
143 /* This cpu is not present. Skip it. */
144 continue;
145 }
146
147 phdr = (PHDR *) bufp;
148 bufp += sizeof(PHDR);
149 phdr->p_type = PT_NOTE;
150 phdr->p_flags = 0;
151 phdr->p_offset = phdr->p_paddr = notes_addr;
152 phdr->p_vaddr = 0;
153 phdr->p_filesz = phdr->p_memsz = notes_len;
154 /* Do we need any alignment of segments? */
155 phdr->p_align = 0;
156
157 /* Increment number of program headers. */
158 (elf->e_phnum)++;
159 dbgprintf_phdr("Elf header", phdr);
160 }
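As a side note, here is a minimal sketch of reading such a sysfs value (a hypothetical helper mirroring what get_crash_notes_per_cpu() does; the exact parsing in kexec-tools may differ):

#include <stdio.h>
#include <stdint.h>

/* Read the physical address of cpu N's crash_notes buffer. */
static int read_crash_notes(int cpu, uint64_t *addr)
{
	char path[128];
	unsigned long long value;
	FILE *fp;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/crash_notes", cpu);
	fp = fopen(path, "r");
	if (!fp)
		return -1;	/* this cpu is not present */
	if (fscanf(fp, "%llx", &value) != 1) {
		fclose(fp);
		return -1;
	}
	fclose(fp);
	*addr = value;
	return 0;
}

int main(void)
{
	uint64_t addr;
	if (read_crash_notes(0, &addr) == 0)
		printf("cpu0 crash_notes @ %#llx\n", (unsigned long long)addr);
	return 0;
}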
After the per-cpu crash notes, it is vmcoreinfo's turn; its address and length were obtained above and stored in vmcoreinfo_addr and vmcoreinfo_len.
162 if (has_vmcoreinfo && !(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) {
163 phdr = (PHDR *) bufp;
164 bufp += sizeof(PHDR);
165 phdr->p_type = PT_NOTE;
166 phdr->p_flags = 0;
167 phdr->p_offset = phdr->p_paddr = vmcoreinfo_addr;
168 phdr->p_vaddr = 0;
169 phdr->p_filesz = phdr->p_memsz = vmcoreinfo_len;
170 /* Do we need any alignment of segments? */
171 phdr->p_align = 0;
172
173 (elf->e_phnum)++;
174 dbgprintf_phdr("vmcoreinfo header", phdr);
175 }
Then it is time to set up the program headers for the memory ranges.
194 /* Setup PT_LOAD type program header for every system RAM chunk.
195 * A seprate program header for Backup Region*/
196 for (i = 0; i < ranges; i++, range++) {
197 unsigned long long mstart, mend;
198 if (range->type != RANGE_RAM)
199 continue;
200 mstart = range->start;
201 mend = range->end;
202 if (!mstart && !mend)
203 continue;
204 phdr = (PHDR *) bufp;
205 bufp += sizeof(PHDR);
206 phdr->p_type = PT_LOAD;
207 phdr->p_flags = PF_R|PF_W|PF_X;
208 phdr->p_offset = mstart;
209
210 if (mstart == info->backup_src_start
211 && (mend - mstart + 1) == info->backup_src_size)
212 phdr->p_offset = info->backup_start;
213
214 /* We already prepared the header for kernel text. Map
215 * rest of the memory segments to kernel linearly mapped
216 * memory region.
217 */
218 phdr->p_paddr = mstart;
219 phdr->p_vaddr = phys_to_virt(elf_info, mstart);// paddr + elf_info->page_offset - phys_offset
220 phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
221 /* Do we need any alignment of segments? */
222 phdr->p_align = 0;
223
224 /* HIGMEM has a virtual address of -1 */
225
226 if (elf_info->lowmem_limit
227 && (mend > (elf_info->lowmem_limit - 1)))
228 phdr->p_vaddr = -1;
229
230 /* Increment number of program headers. */
231 (elf->e_phnum)++;
232 dbgprintf_phdr("Elf header", phdr);
233 }
234 return 0;
235 }
Now, back in load_crashdump_segments(), we have built the ELF core header that will be placed at the end of the memory area reserved for the crashkernel.
To summarize, the ELF core header is composed of:
1. The ELF header.
2. One crash-notes PT_NOTE program header per CPU. (Note: on processors with Hyper-Threading, the CPU count here is the number of logical cores, not physical ones; see http://en.wikipedia.org/wiki/Hyper-threading. ARM processors do not use Hyper-Threading.) Each CPU's crash notes address can be obtained by reading "/sys/devices/system/cpu/cpuN/crash_notes".
3. The vmcoreinfo PT_NOTE program header. The vmcoreinfo address can be obtained by reading /sys/kernel/vmcoreinfo.
4. One PT_LOAD program header per memory range, taken from usablemem_rgns. A sketch of the resulting layout follows.
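Assuming ELF32, a single CPU, and the two split RAM ranges from above (so sizeof(EHDR) = 52 and sizeof(PHDR) = 32), the buffer laid out by FUNC looks like:

  offset   0 : Elf32_Ehdr, e_type = ET_CORE
  offset  52 : Elf32_Phdr, PT_NOTE -> cpu0 crash_notes
  offset  84 : Elf32_Phdr, PT_NOTE -> vmcoreinfo
  offset 116 : Elf32_Phdr, PT_LOAD -> System RAM below the crashkernel region
  offset 148 : Elf32_Phdr, PT_LOAD -> System RAM above the crashkernel region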
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
292 if (err)
293 return err;
294
295 /*
296 * We allocate ELF core header from the end of the memory area reserved
297 * for the crashkernel. We align the header to SECTION_SIZE (which is
298 * 1MB) so that available memory passed in kernel command line will be
299 * aligned to 1MB. This is because kernel create_mapping() wants memory
300 * regions to be aligned to SECTION_SIZE.
301 */
302 elfcorehdr = add_buffer_phys_virt(info, buf, bufsz, bufsz, 1 << 20,
303 crash_reserved_mem.start,
304 crash_reserved_mem.end, -1, 0);
305
306 dbgprintf("elfcorehdr: %#lx\n", elfcorehdr);
307 cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
308
309 /*
310 * Add 'mem=size' parameter to dump capture kernel command line. This
311 * prevents the dump capture kernel from using any other memory regions
312 * which belong to the primary kernel.
313 */
314 cmdline_add_mem(mod_cmdline, elfcorehdr - crash_reserved_mem.start);
315
316 dump_memory_ranges();
317 dbgprintf("kernel command line: \"%s\"\n", mod_cmdline);
318
319 return 0;
320 }
Line 302 obtains elfcorehdr, the address of the buffer pre-allocated for the ELF header and program headers. Then "elfcorehdr=X" is added to the command line passed to the capture kernel. Line 314 adds "mem=Z" to the command line, where Z equals (crash reserved memory size - 1MB), since the ELF core header occupies the last 1MB-aligned part of the reserved region.
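For example, with crashkernel=64M@98M the reserved range is 0x06200000-0x0a1fffff. locate_hole() places the header buffer at the last 1MB-aligned address, 0x0a100000, so the capture kernel's command line gains roughly "elfcorehdr=0xa100000 mem=64512K" (0x0a100000 - 0x06200000 = 63MB = 64512KB; the exact formatting depends on cmdline_add_elfcorehdr()/cmdline_add_mem()).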
main->my_load->zImage_arm_load-> load_crashdump_segments-> add_buffer_phys_virt
First, let's walk through the parameters: info was initialized in my_load(); buf and bufsz describe the ELF header/program header buffer; memsz here is the same as bufsz; buf_align is 1MB; buf_min and buf_max bound the reserved memory range; buf_end is -1 (which makes locate_hole() pick the highest suitable hole, i.e. the end of the reserved range), and phys is 0.
357 unsigned long add_buffer_phys_virt(struct kexec_info *info,
358 const void *buf, unsigned long bufsz, unsigned long memsz,
359 unsigned long buf_align, unsigned long buf_min, unsigned long buf_max,
360 int buf_end, int phys)
361 {
362 unsigned long base;
363 int result;
364 int pagesize;
365
366 result = sort_segments(info);
367 if (result < 0) {
368 die("sort_segments failed\n");
369 }
370
371 /* Round memsz up to a multiple of pagesize */
372 pagesize = getpagesize();
373 memsz = _ALIGN(memsz, pagesize);
374
375 base = locate_hole(info, memsz, buf_align, buf_min, buf_max, buf_end);
376 if (base == ULONG_MAX) {
377 die("locate_hole failed\n");
378 }
379
380 add_segment_phys_virt(info, buf, bufsz, base, memsz, phys);
381 return base;
382 }
Here base points into the last 1MB of the memory reserved by crashkernel=x@y. The segment is then added to info by add_segment_phys_virt() on line 380.
main->my_load->zImage_arm_load (cont'd)
From line 335, base is the start of the reserved memory range.
340
341 if (base == ULONG_MAX)
342 return -1;
343
344 /* assume the maximum kernel compression ratio is 4,
345 * and just to be safe, place ramdisk after that
346 */
347 initrd_base = base + len * 4;
348
349 if (use_atags) {
350 /*
351 * use ATAGs from /proc/atags
352 */
353 if (atag_arm_load(info, base + atag_offset,
354 command_line, command_line_len,
355 ramdisk_buf, initrd_size, initrd_base) == -1)
356 return -1;
…
427 add_segment(info, buf, len, base + offset, len);
428
429 info->entry = (void*)base + offset;
430
431 return 0;
432 }
If "--atags" was given, atag_arm_load() reads /proc/atags at line 353 and adds the ATAGs (together with the command line and ramdisk) to the segments in info. Finally, at line 427, the kernel buffer itself is added as a segment, and info->entry is set to base + offset (the kernel's 32KB offset).
main->my_load:
…
777
778 result = kexec_load(
779 info.entry, info.nr_segments, info.segment, info.kexec_flags);
…
788 return result;
789 }
After the ELF core header is filled in, the ELF header buffer, ATAGs, ramdisk, and kernel buffer have all been added to the segments; these segments are then passed to the kexec_load system call.
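For reference, each element of info->segment[] follows the layout of struct kexec_segment from the kernel's uapi headers:

/* from the kernel's uapi kexec header (native layout) */
struct kexec_segment {
	const void *buf;	/* user-space source buffer */
	size_t bufsz;		/* bytes of data in buf */
	const void *mem;	/* physical destination address */
	size_t memsz;		/* bytes reserved at the destination;
				 * anything past bufsz is zero-filled */
};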
Now we have analyzed the kexec loading routine in user space. Next, the kernel-space code is analyzed.
3. Routine of Kernel Mode
In the kexec-tools source code, info->entry is filled with the start of the reserved crash memory plus a 32KB offset.
218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219 		struct kexec_info *info)
…
328 	if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329 		/*
330 		 * No crash kernel memory reserved. We cannot do more
331 		 * but just bail out.
332 		 */
333 		return -1;
334 	}
335 	base = start;
…
427 	add_segment(info, buf, len, base + offset, len);
428
429 	print_segment(stderr,info);
430 	info->entry = (void*)base + offset;
}
In our case, crashkernel=64MB@98MB, so info->entry = 98MB + 32KB. When kdump is loaded with "kexec -p", the flags are KEXEC_ARCH_ARM | KEXEC_ON_CRASH.
The segments parameter is allocated by kexec-tools and records the ATAGs, ramdisk, kernel, and elfcorehdr buffers. The ATAGs, ramdisk, and kernel buffers are added to info->segment[] via add_segment(); the elfcorehdr is added via add_segment_phys_virt(). The sketch below illustrates the segments.
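(Rough sketch; ordering and exact addresses are illustrative and depend on the image sizes:)

  info->segment[]:
    ATAGs       -> base + 0x1000 (98MB + 4KB)
    kernel      -> base + 0x8000 (98MB + 32KB, info->entry)
    ramdisk     -> initrd_base = base + 4*len (past 4x the compressed kernel)
    elfcorehdr  -> last 1MB of the reserved range (about 0x0a100000)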
941 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
942 		struct kexec_segment __user *, segments, unsigned long, flags)
943 {
944 	struct kimage **dest_image, *image;
945 	int result;
…
969 	image = NULL;
970 	result = 0;
971
972 	/* Because we write directly to the reserved memory
973 	 * region when loading crash kernels we need a mutex here to
974 	 * prevent multiple crash kernels from attempting to load
975 	 * simultaneously, and to prevent a crash kernel from loading
976 	 * over the top of a in use crash kernel.
977 	 *
978 	 * KISS: always take the mutex.
979 	 */
980 	if (!mutex_trylock(&kexec_mutex))
981 		return -EBUSY;
…
984 	if (flags & KEXEC_ON_CRASH)
985 		dest_image = &kexec_crash_image;
986 	if (nr_segments > 0) {
987 		unsigned long i;
988
989 		/* Loading another kernel to reboot into */
…
994 		else if (flags & KEXEC_ON_CRASH) {
995 			/* Free any current crash dump kernel before
996 			 * we corrupt it.
997 			 */
998 			kimage_free(xchg(&kexec_crash_image, NULL));
999 			result = kimage_crash_alloc(&image, entry,
1000 						nr_segments, segments);
1001 			crash_map_reserved_pages();//Null here.
1002 		}
1003 		if (result)
1004 			goto out;
1005
1006 		if (flags & KEXEC_PRESERVE_CONTEXT)
1007 			image->preserve_context = 1;
1008 		result = machine_kexec_prepare(image);
1009 		if (result)
1010 			goto out;
1011
1012 		for (i = 0; i < nr_segments; i++) {
1013 			result = kimage_load_segment(image, &image->segment[i]);
1014 			if (result)
1015 				goto out;
1016 		}
1017 		kimage_terminate(image);
1018 		if (flags & KEXEC_ON_CRASH)
1019 			crash_unmap_reserved_pages();
1020 	}
1021 	/* Install the new kernel, and Uninstall the old */
1022 	image = xchg(dest_image, image);
1023
1024 out:
1025 	mutex_unlock(&kexec_mutex);
1026 	kimage_free(image);
1027
1028 	return result;
1029 }
kexec_load-> kimage_crash_alloc
In this function, there are three steps:
1. Allocate and initialize a struct kimage object (corresponding to line 999).
2. Allocate a control page, which will be filled when the soft reboot happens (also corresponding to line 999).
3. Copy the buffers recorded in info->segment[] to the reserved memory range (lines 1012~1016).
272 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
273 				unsigned long nr_segments,
274 				struct kexec_segment __user *segments)
275 {
276 	int result;
277 	struct kimage *image;
278 	unsigned long i;
279
280 	image = NULL;
281 	/* Verify we have a valid entry point */
282 	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
283 		result = -EADDRNOTAVAIL;
284 		goto out;
285 	}
286
287 	/* Allocate and initialize a controlling structure */
288 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
289 	if (result)
290 		goto out;
291
292 	/* Enable the special crash kernel control page
293 	 * allocation policy.
294 	 */
295 	image->control_page = crashk_res.start;
296 	image->type = KEXEC_TYPE_CRASH;
297
298 	/*
299 	 * Verify we have good destination addresses. Normally
300 	 * the caller is responsible for making certain we don't
301 	 * attempt to load the new image into invalid or reserved
302 	 * areas of RAM. But crash kernels are preloaded into a
303 	 * reserved area of ram. We must ensure the addresses
304 	 * are in the reserved area otherwise preloading the
305 	 * kernel could corrupt things.
306 	 */
307 	result = -EADDRNOTAVAIL;
308 	for (i = 0; i < nr_segments; i++) {
309 		unsigned long mstart, mend;
310
311 		mstart = image->segment[i].mem;
312 		mend = mstart + image->segment[i].memsz - 1;
313 		/* Ensure we are within the crash kernel limits */
314 		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
315 			goto out;
316 	}
317
318 	/*
319 	 * Find a location for the control code buffer, and add
320 	 * the vector of segments so that it's pages will also be
321 	 * counted as destination pages.
322 	 */
323 	result = -ENOMEM;
324 	image->control_code_page = kimage_alloc_control_pages(image,
325 					get_order(KEXEC_CONTROL_PAGE_SIZE));
326 	if (!image->control_code_page) {
327 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
328 		goto out;
329 	}
330
331 	result = 0;
332 out:
333 	if (result == 0)
334 		*rimage = image;
335 	else
336 		kfree(image);
337
338 	return result;
339 }
Line 288 allocates a struct kimage object and initializes it. Lines 308~316 check each segment's memory range to ensure it does not fall outside the crashkernel reserved memory range.
kexec_load-> kimage_crash_alloc-> do_kimage_alloc
118 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
119 				unsigned long nr_segments,
120 				struct kexec_segment __user *segments)
121 {
122 	size_t segment_bytes;
123 	struct kimage *image;
124 	unsigned long i;
125 	int result;
126
127 	/* Allocate a controlling structure */
128 	result = -ENOMEM;
129 	image = kzalloc(sizeof(*image), GFP_KERNEL);
130 	if (!image)
131 		goto out;
132
133 	image->head = 0;
134 	image->entry = &image->head;
135 	image->last_entry = &image->head;
136 	image->control_page = ~0; /* By default this does not apply */
137 	image->start = entry;
138 	image->type = KEXEC_TYPE_DEFAULT;
139
140 	/* Initialize the list of control pages */
141 	INIT_LIST_HEAD(&image->control_pages);
142
143 	/* Initialize the list of destination pages */
144 	INIT_LIST_HEAD(&image->dest_pages);
145
146 	/* Initialize the list of unusable pages */
147 	INIT_LIST_HEAD(&image->unuseable_pages);
148
149 	/* Read in the segments */
150 	image->nr_segments = nr_segments;
151 	segment_bytes = nr_segments * sizeof(*segments);
152 	result = copy_from_user(image->segment, segments, segment_bytes);
153 	if (result) {
154 		result = -EFAULT;
155 		goto out;
156 	}
157
158 	/*
159 	 * Verify we have good destination addresses. The caller is
160 	 * responsible for making certain we don't attempt to load
161 	 * the new image into invalid or reserved areas of RAM. This
162 	 * just verifies it is an address we can use.
163 	 *
164 	 * Since the kernel does everything in page size chunks ensure
165 	 * the destination addresses are page aligned. Too many
166 	 * special cases crop of when we don't do this. The most
167 	 * insidious is getting overlapping destination addresses
168 	 * simply because addresses are changed to page size
169 	 * granularity.
170 	 */
171 	result = -EADDRNOTAVAIL;
172 	for (i = 0; i < nr_segments; i++) {
173 		unsigned long mstart, mend;
174
175 		mstart = image->segment[i].mem;
176 		mend = mstart + image->segment[i].memsz;
177 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
178 			goto out;
179 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
180 			goto out;
181 	}
182
183 	/* Verify our destination addresses do not overlap.
184 	 * If we alloed overlapping destination addresses
185 	 * through very weird things can happen with no
186 	 * easy explanation as one segment stops on another.
187 	 */
188 	result = -EINVAL;
189 	for (i = 0; i < nr_segments; i++) {
190 		unsigned long mstart, mend;
191 		unsigned long j;
192
193 		mstart = image->segment[i].mem;
194 		mend = mstart + image->segment[i].memsz;
195 		for (j = 0; j < i; j++) {
196 			unsigned long pstart, pend;
197 			pstart = image->segment[j].mem;
198 			pend = pstart + image->segment[j].memsz;
199 			/* Do the segments overlap ? */
200 			if ((mend > pstart) && (mstart < pend))
201 				goto out;
202 		}
203 	}
204
205 	/* Ensure our buffer sizes are strictly less than
206 	 * our memory sizes. This should always be the case,
207 	 * and it is easier to check up front than to be surprised
208 	 * later on.
209 	 */
210 	result = -EINVAL;
211 	for (i = 0; i < nr_segments; i++) {
212 		if (image->segment[i].bufsz > image->segment[i].memsz)
213 			goto out;
214 	}
215
216 	result = 0;
217 out:
218 	if (result == 0)
219 		*rimage = image;
220 	else
221 		kfree(image);
222
223 	return result;
224
225 }
kexec_load-> kimage_crash_alloc->kimage_alloc_control_pages-> kimage_alloc_crash_control_pages
This function completes step 2 listed above.
467 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
468 						unsigned int order)
469 {
470 	/* Control pages are special, they are the intermediaries
471 	 * that are needed while we copy the rest of the pages
472 	 * to their final resting place. As such they must
473 	 * not conflict with either the destination addresses
474 	 * or memory the kernel is already using.
475 	 *
476 	 * Control pages are also the only pags we must allocate
477 	 * when loading a crash kernel. All of the other pages
478 	 * are specified by the segments and we just memcpy
479 	 * into them directly.
480 	 *
481 	 * The only case where we really need more than one of
482 	 * these are for architectures where we cannot disable
483 	 * the MMU and must instead generate an identity mapped
484 	 * page table for all of the memory.
485 	 *
486 	 * Given the low demand this implements a very simple
487 	 * allocator that finds the first hole of the appropriate
488 	 * size in the reserved memory region, and allocates all
489 	 * of the memory up to and including the hole.
490 	 */
491 	unsigned long hole_start, hole_end, size;
492 	struct page *pages;
493
494 	pages = NULL;
495 	size = (1 << order) << PAGE_SHIFT;
496 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
497 	hole_end = hole_start + size - 1;
498 	while (hole_end <= crashk_res.end) {
499 		unsigned long i;
500
501 		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
502 			break;
503 		if (hole_end > crashk_res.end)
504 			break;
505 		/* See if I overlap any of the segments */
506 		for (i = 0; i < image->nr_segments; i++) {
507 			unsigned long mstart, mend;
508
509 			mstart = image->segment[i].mem;
510 			mend = mstart + image->segment[i].memsz - 1;
511 			if ((hole_end >= mstart) && (hole_start <= mend)) {
512 				/* Advance the hole to the end of the segment */
513 				hole_start = (mend + (size - 1)) & ~(size - 1);
514 				hole_end = hole_start + size - 1;
515 				break;
516 			}
517 		}
518 		/* If I don't overlap any segments I have found my hole! */
519 		if (i == image->nr_segments) {
520 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
521 			break;
522 		}
523 	}
524 	if (pages)
525 		image->control_page = hole_end;
526
527 	return pages;
528 }
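A worked example of the hole-search arithmetic (hypothetical numbers): with order = 2 and 4KB pages, size = (1 << 2) << PAGE_SHIFT = 16KB. Starting from image->control_page = crashk_res.start = 0x06200000, hole_start = (0x06200000 + 0x3fff) & ~0x3fff = 0x06200000 (already 16KB-aligned) and hole_end = 0x06203fff. If that window overlaps a segment ending at 0x06204fff, the hole advances to hole_start = (0x06204fff + 0x3fff) & ~0x3fff = 0x06208000.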
Next, the buffers are copied to the memory allocated within the crash reserved memory range, completing step 3 listed above.
kexec_load ->kimage_load_segment->kimage_load_crash_segment:
845 static int kimage_load_crash_segment(struct kimage *image,
846 					struct kexec_segment *segment)
847 {
848 	/* For crash dumps kernels we simply copy the data from
849 	 * user space to it's destination.
850 	 * We do things a page at a time for the sake of kmap.
851 	 */
852 	unsigned long maddr;
853 	unsigned long ubytes, mbytes;
854 	int result;
855 	unsigned char __user *buf;
856
857 	result = 0;
858 	buf = segment->buf;
859 	ubytes = segment->bufsz;
860 	mbytes = segment->memsz;
861 	maddr = segment->mem;
862 	while (mbytes) {
863 		struct page *page;
864 		char *ptr;
865 		size_t uchunk, mchunk;
866
867 		page = pfn_to_page(maddr >> PAGE_SHIFT);
868 		if (!page) {
869 			result = -ENOMEM;
870 			goto out;
871 		}
872 		ptr = kmap(page);
873 		ptr += maddr & ~PAGE_MASK;
874 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
875 		if (mchunk > mbytes)
876 			mchunk = mbytes;
877
878 		uchunk = mchunk;
879 		if (uchunk > ubytes) {
880 			uchunk = ubytes;
881 			/* Zero the trailing part of the page */
882 			memset(ptr + uchunk, 0, mchunk - uchunk);
883 		}
884 		result = copy_from_user(ptr, buf, uchunk);
885 		kexec_flush_icache_page(page);
886 		kunmap(page);
887 		if (result) {
888 			result = -EFAULT;
889 			goto out;
890 		}
891 		ubytes -= uchunk;
892 		maddr += mchunk;
893 		buf += mchunk;
894 		mbytes -= mchunk;
895 	}
896 out:
897 	return result;
898 }