Understanding Kdump (Loading Part)


Table of Contents

1. Introduction
2. Routine of User Mode
3. Routine of Kernel Mode


1. Introduction

kdump, built on top of kexec, is a facility for debugging the Linux kernel. Currently, kdump and kexec are both integrated into the kexec-tools program. After compiling the program, run "build/sbin/kexec" with the "-l" option for the plain kexec function; the "-p" option is for kdump instead. More differences between kexec and kdump are listed below.

1. kexec's second kernel will overwrite the first kernel.
2. To use kdump, you must boot the first kernel with "crashkernel=X@Y" on its command line. After the second (capture) kernel is brought up, you will find that kexec-tools has appended two parameters to its command line: "mem=A" and "elfcorehdr=B". "mem" sets the second kernel's available memory size, and "elfcorehdr" tells the second kernel where the debug information (the ELF core header) can be found. See the example below.
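For example, loading a capture kernel might look like this (paths, device names, and addresses here are illustrative, not from a real board):

# first kernel booted with: ... console=ttyS0 root=/dev/mmcblk0p2 crashkernel=64M@98M
build/sbin/kexec -p /boot/zImage --dtb=/boot/board.dtb \
        --append="console=ttyS0 root=/dev/mmcblk0p2 maxcpus=1"

# after the capture kernel boots, its /proc/cmdline ends with
# something like: ... elfcorehdr=0xa100000 mem=64512K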

Now let's briefly walk through the kdump load routine.


2. Routine of User Mode



1061 int main(int argc, char *argv[])

1062 {

1063         int do_load = 1;
1064         int do_exec = 0;
1065         int do_load_jump_back_helper = 0;
1066         int do_shutdown = 1;
1067         int do_sync = 1;
1068         int do_ifdown = 0;
1069         int do_unload = 0;
1070         int do_reuse_initrd = 0;
1071         void *entry = 0;
1072         char *type = 0;
1073         char *endptr;
1074         int opt;
1075         int result = 0;
1076         int fileind;
1077         static const struct option options[] = {
1078                 KEXEC_ALL_OPTIONS
1079                 { 0, 0, 0, 0},
1080         };
1081         static const char short_options[] = KEXEC_ALL_OPT_STR;
1082
1083         while ((opt = getopt_long(argc, argv, short_options,
1084                                   options, 0)) != -1) {
1085                 switch(opt) {
1086                 case '?':
1087                 case OPT_HELP:
1088                         usage();
1089                         return 0;
1090                 case OPT_VERSION:
1152                 case OPT_PANIC:
1153                         do_load = 1;
1154                         do_exec = 0;
1155                         do_shutdown = 0;
1156                         do_sync = 0;
1157                         kexec_flags = KEXEC_ON_CRASH;
1158                         break;
1187         if ((kexec_flags & KEXEC_ON_CRASH) && !is_crashkernel_mem_reserved()) {
1188                 printf("Memory for crashkernel is not reserved\n");
1189                 printf("Please reserve memory by passing ");
1190                 printf("\"crashkernel=X@Y\" parameter to the kernel\n");
1191                 die("Then try loading kdump kernel\n");
1192         }
1193
1201         fileind = optind;
1202         /* Reset getopt for the next pass; called in other source modules */
1203         opterr = 1;
1204         optind = 1;
1205
1206         result = arch_process_options(argc, argv);
1227         if (do_load && (result == 0)) {
1228                 result = my_load(type, fileind, argc, argv, kexec_flags, entry);
1229         }
main->my_load:
648 /*
649  *      Load the new kernel
650  */
651 static int my_load(const char *type, int fileind, int argc, char **argv,
652                    unsigned long kexec_flags, void *entry)
653 {
654         char *kernel;
655         char *kernel_buf;
656         off_t kernel_size;
657         int i = 0;
658         int result;
659         struct kexec_info info;
660         long native_arch;
661         int guess_only = 0;
662
663         memset(&info, 0, sizeof(info));
664         info.segment = NULL;
665         info.nr_segments = 0;
666         info.entry = NULL;
667         info.backup_start = 0;
668         info.kexec_flags = kexec_flags;
669
670         result = 0;
671         if (argc - fileind <= 0) {
672                 fprintf(stderr, "No kernel specified\n");
673                 usage();
674                 return -1;
675         }
676         kernel = argv[fileind];
677         /* slurp in the input kernel */
678         kernel_buf = slurp_decompress_file(kernel, &kernel_size);
679
680         dbgprintf("kernel: %p kernel_size: %lx\n",
681                   kernel_buf, kernel_size);
682
683         if (get_memory_ranges(&info.memory_range, &info.memory_ranges,
684                 info.kexec_flags) < 0 || info.memory_ranges == 0) {
685                 fprintf(stderr, "Could not get memory layout\n");
686                 return -1;
687         }
688         /* if a kernel type was specified, try to honor it */
689         if (type) {
690                 for (i = 0; i < file_types; i++) {
691                         if (strcmp(type, file_type[i].name) == 0)
692                                 break;
693                 }
694                 if (i == file_types) {
695                         fprintf(stderr, "Unsupported kernel type %s\n", type);
696                         return -1;
697                 } else {
698                         /* make sure our file is really of that type */
699                         if (file_type[i].probe(kernel_buf, kernel_size) < 0)
700                                 guess_only = 1;
701                 }
702         }
703         if (!type || guess_only) {
704                 for (i = 0; i < file_types; i++) {
705                         if (file_type[i].probe(kernel_buf, kernel_size) >= 0)
706                                 break;
707                 }
708                 if (i == file_types) {
709                         fprintf(stderr, "Cannot determine the file type "
710                                         "of %s\n", kernel);
711                         return -1;
712                 } else {
713                         if (guess_only) {
714                                 fprintf(stderr, "Wrong file type %s, "
715                                         "file matches type %s\n",
716                                         type, file_type[i].name);
717                                 return -1;
718                         }
719                 }
720         }
721         /* Figure out our native architecture before load */
722         native_arch = physical_arch();
723         if (native_arch < 0) {
724                 return -1;
725         }
726         info.kexec_flags |= native_arch;//Now there are two flags, KEXEC_ON_CRASH|KEXEC_ARCH_ARM
727
728         result = file_type[i].load(argc, argv, kernel_buf, kernel_size, &info);//0: success
main->my_load->zImage_arm_load:
218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219         struct kexec_info *info)
220 {
221         unsigned long base;
222         unsigned int atag_offset = 0x1000; /* 4k offset from memory start */
223         unsigned int offset = 0x8000;      /* 32k offset from memory start */
224         const char *command_line;
225         char *modified_cmdline = NULL;
226         off_t command_line_len;
227         const char *ramdisk;
228         char *ramdisk_buf;
229         int opt;
230         int use_atags;
231         char *dtb_buf;
232         off_t dtb_length;
233         char *dtb_file;
234         off_t dtb_offset;
235         dbgprintf("buf:%p, len:%lx\n",buf,len);
236         /* See options.h -- add any more there, too. */
237         static const struct option options[] = {
238                 KEXEC_ARCH_OPTIONS
239                 { "command-line",       1, 0, OPT_APPEND },
240                 { "append",             1, 0, OPT_APPEND },
241                 { "initrd",             1, 0, OPT_RAMDISK },
242                 { "ramdisk",            1, 0, OPT_RAMDISK },
243                 { "dtb",                1, 0, OPT_DTB },
244                 { "atags",              0, 0, OPT_ATAGS },
245                 { 0,                    0, 0, 0 },
246         };
247         static const char short_options[] = KEXEC_ARCH_OPT_STR "a:r:";
248
249         /*
250          * Parse the command line arguments
251          */
252         command_line = 0;
253         command_line_len = 0;
254         ramdisk = 0;
255         ramdisk_buf = 0;
256         initrd_size = 0;
257         use_atags = 0;
258         dtb_file = NULL;
259         while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
260                 switch(opt) {
261                 default:
262                         /* Ignore core options */
263                         if (opt < OPT_ARCH_MAX) {
264                                 break;
265                         }
266                 case '?':
267                         usage();
268                         return -1;
269                 case OPT_APPEND:
270                         command_line = optarg;
271                         break;
272                 case OPT_RAMDISK:
273                         ramdisk = optarg;
274                         break;
275                 case OPT_DTB:
276                         dtb_file = optarg;
277                         break;
278                 case OPT_ATAGS:
279                         use_atags = 1;
280                         break;
281                 }
282         }
283
284         if (use_atags && dtb_file) {
285                 fprintf(stderr, "You can only use ATAGs if you don't specify a "
286                         "dtb file.\n");
287                 return -1;
288         }
289
290         if (command_line) {
291                 command_line_len = strlen(command_line) + 1;
292                 if (command_line_len > COMMAND_LINE_SIZE)
293                         command_line_len = COMMAND_LINE_SIZE;
294         }
295         if (ramdisk) { // read the initial ramdisk into memory
296                 ramdisk_buf = slurp_file(ramdisk, &initrd_size);
297         }
298
299         /*
300          * If we are loading a dump capture kernel, we need to update kernel
301          * command line and also add some additional segments.
302          */
303         if (info->kexec_flags & KEXEC_ON_CRASH) {
304                 uint64_t start, end;
305
306                 modified_cmdline = xmalloc(COMMAND_LINE_SIZE);
307                 if (!modified_cmdline)
308                         return -1;
309
310                 if (command_line) {
311                         (void) strncpy(modified_cmdline, command_line,
312                                        COMMAND_LINE_SIZE);
313                         modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
314                 }
315
316                 if (load_crashdump_segments(info, modified_cmdline) < 0) {
317                         free(modified_cmdline);
318                         return -1;
319                 }
320
321                 command_line = modified_cmdline;
322                 command_line_len = strlen(command_line) + 1;
323
324                 /*
325                  * We put the dump capture kernel at the start of crashkernel
326                  * reserved memory.
327                  */
328                 if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329                         /*
330                          * No crash kernel memory reserved. We cannot do more
331                          * but just bail out.
332                          */
333                         return -1;
334                 }
335                 base = start;
336         } else {
337                 dbgprintf("len:%lx,offset:%ux,len+offset:%lx\n",len,offset,len+offset);
338                 base = locate_hole(info,len+offset,0,0,ULONG_MAX,INT_MAX);
339         }
Lines 303~314 and 321~322 handle the command line. Line 335 deserves extra attention. Now let's look at line 316.
main->my_load->zImage_arm_load-> load_crashdump_segments
255 /**
256  * load_crashdump_segments() - loads additional segments needed for kdump
257  * @info: kexec info structure
258  * @mod_cmdline: kernel command line
259  *
260  * This function loads additional segments which are needed for the dump capture
261  * kernel. It also updates kernel command line passed in @mod_cmdline to have
262  * right parameters for the dump capture kernel.
263  *
264  * Return %0 in case of success and %-1 in case of error.
265  */
266 int load_crashdump_segments(struct kexec_info *info, char *mod_cmdline)
267 {
268         unsigned long elfcorehdr;
269         unsigned long bufsz;
270         void *buf;
271         int err;
272
273         /*
274          * First fetch all the memory (RAM) ranges that we are going to pass to
275          * the crashdump kernel during panic.
276          */
277         err = crash_get_memory_ranges();
main->my_load->zImage_arm_load-> load_crashdump_segments->crash_get_memory_ranges
155 static int crash_get_memory_ranges(void)
156 {
157         /*
158          * First read all memory regions that can be considered as
159          * system memory including the crash area.
160          */
161         kexec_iomem_for_each_line(NULL, crash_range_callback, NULL);
167
168         /*
169          * Exclude memory reserved for crashkernel (this may result a split memory
170          * region).
171          */
172         crash_exclude_range();
173
174         /*
175          * Make sure that the memory regions are sorted.
176          */
177         qsort(usablemem_rgns.ranges, usablemem_rgns.size,
178               sizeof(*usablemem_rgns.ranges), range_cmp);
179
180         return 0;
181 }
At line 161, kexec_iomem_for_each_line() will:

1. Look up "System RAM" in /proc/iomem and store each such range in usablemem_rgns.ranges, incrementing usablemem_rgns.size.

2. Look up "Crash kernel" in /proc/iomem and store it in crash_reserved_mem.
Then, at line 172, crash_exclude_range() removes the crash_reserved_mem range from the "System RAM" range in usablemem_rgns.ranges, so usablemem_rgns is split into two ranges, as illustrated below.
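For example (a sketch using the crashkernel=64M@98M layout assumed later in this article):

System RAM:    0x00000000 - 0x0fffffff   (256MB)
Crash kernel:  0x06200000 - 0x0a1fffff   (64MB @ 98MB)

after crash_exclude_range(), usablemem_rgns holds:
range 0:       0x00000000 - 0x061fffff
range 1:       0x0a200000 - 0x0fffffff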
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
281         /*
282          * Now that we have memory regions sorted, we can use first memory
283          * region as PHYS_OFFSET.
284          */
285         phys_offset = usablemem_rgns.ranges->start;
286         dbgprintf("phys_offset: %#lx\n", phys_offset);
287
288         err = crash_create_elf32_headers(info, &elf_info,
289                                          usablemem_rgns.ranges,
290                                          usablemem_rgns.size, &buf, &bufsz,
291                                          ELF_CORE_HEADER_ALIGN);
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers(FUNC)
FUNC is a macro: this file is built once for ELF32 and once for ELF64, so in our case FUNC expands to crash_create_elf32_headers. elf_info is defined below.
48 static struct crash_elf_info elf_info = {
49         .class          = ELFCLASS32,
50         .data           = ELFDATA2LSB,
51         .machine        = EM_ARM,
52         .page_offset    = PAGE_OFFSET,
53 };
29 int FUNC(struct kexec_info *info,
30          struct crash_elf_info *elf_info,
31          struct memory_range *range, int ranges,
32          void **buf, unsigned long *size, unsigned long align) // buf and size are output parameters
33 {
34         EHDR *elf;
35         PHDR *phdr;
36         int i;
37         unsigned long sz;
38         char *bufp;
39         long int nr_cpus = 0;
40         uint64_t notes_addr, notes_len;
41         uint64_t vmcoreinfo_addr, vmcoreinfo_len;
42         int has_vmcoreinfo = 0;
43         int (*get_note_info)(int cpu, uint64_t *addr, uint64_t *len);
44
45         if (xen_present())
46                 nr_cpus = xen_get_nr_phys_cpus();
47         else
48                 nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
49
50         if (nr_cpus < 0) {
51                 return -1;
52         }
53
54         if (xen_present()) {
55                 if (!get_xen_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
56                         has_vmcoreinfo = 1;
57         } else
58                 if (!get_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
59                         has_vmcoreinfo = 1;
get_kernel_vmcoreinfo() reads the vmcoreinfo address and length from /sys/kernel/vmcoreinfo. Let's see the details below.
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers(FUNC)-> get_kernel_vmcoreinfo
139 /* Returns the physical address of start of crash notes buffer for a kernel. */
140 int get_kernel_vmcoreinfo(uint64_t *addr, uint64_t *len)
141 {
142         return get_vmcoreinfo("/sys/kernel/vmcoreinfo", addr, len);
143 }
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers(FUNC)-> get_kernel_vmcoreinfo-> get_vmcoreinfo
113 static int get_vmcoreinfo(const char *kdump_info, uint64_t *addr, uint64_t *len)
114 {
115         char line[MAX_LINE];
116         int count;
117         FILE *fp;
118         unsigned long long temp, temp2;
119
120         *addr = 0;
121         *len = 0;
122
123         if (!(fp = fopen(kdump_info, "r")))
124                 return -1;
125
126         if (!fgets(line, sizeof(line), fp))
127                 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
128         count = sscanf(line, "%Lx %Lx", &temp, &temp2);
129         if (count != 2)
130                 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
131
132         *addr = (uint64_t) temp;
133         *len = (uint64_t) temp2;
134
135         fclose(fp);
136         return 0;
137 }
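On a running kernel that file contains two hex numbers: the physical address and the length of the vmcoreinfo ELF note. For example (the values shown are illustrative):

$ cat /sys/kernel/vmcoreinfo
245c7000 1000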
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers(FUNC)  (cont'd)
61         sz = sizeof(EHDR) + (nr_cpus + has_vmcoreinfo) * sizeof(PHDR) +
62              ranges * sizeof(PHDR);
EHDR is `struct Elf32_Ehdr` here, and PHDR is a program header (`struct Elf32_Phdr`). With one CPU, vmcoreinfo present, and our two memory ranges, sz = sizeof(EHDR) + (1 + 1) * sizeof(PHDR) + 2 * sizeof(PHDR), as checked below.
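A quick sanity check of that arithmetic (a standalone sketch, not part of kexec-tools; on ELF32, sizeof(Elf32_Ehdr) is 52 and sizeof(Elf32_Phdr) is 32):

#include <elf.h>
#include <stdio.h>

int main(void)
{
        /* 1 per-cpu note PHDR + 1 vmcoreinfo PHDR + 2 RAM-range PHDRs */
        size_t sz = sizeof(Elf32_Ehdr)
                  + (1 + 1) * sizeof(Elf32_Phdr)
                  + 2 * sizeof(Elf32_Phdr);
        printf("sz = %zu\n", sz);       /* 52 + 4 * 32 = 180 */
        return 0;
}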
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers(FUNC)  (cont'd)
63
64         /*
65          * Certain architectures such as x86_64 and ia64 require a separate
66          * PT_LOAD program header for the kernel. This is controlled through
67          * elf_info->kern_size.
68          *
69          * The separate PT_LOAD program header is required either because the
70          * kernel is mapped at a different location than the rest of the
71          * physical memory or because we need to support relocatable kernels.
72          * Or both as on x86_64.
73          *
74          * In the relocatable kernel case this PT_LOAD segment is used to tell
75          * where the kernel was actually loaded which may be different from
76          * the load address present in the vmlinux file.
77          *
78          * The extra kernel PT_LOAD program header results in a vmcore file
79          * which is larger than the size of the physical memory. This is
80          * because the memory for the kernel is present both in the kernel
81          * PT_LOAD program header and in the physical RAM program headers.
82          */
83
84         if (elf_info->kern_size && !xen_present()) {
85                 sz += sizeof(PHDR);
86         }
elf_info->kern_size is zero here. Next, the ELF header is filled in.
99         sz = _ALIGN(sz, align);
100
101         bufp = xmalloc(sz);
102         memset(bufp, 0, sz);
103
104         *buf = bufp;
105         *size = sz;
106
107         /* Setup ELF Header*/
108         elf = (EHDR *) bufp;
109         bufp += sizeof(EHDR);
110         memcpy(elf->e_ident, ELFMAG, SELFMAG);
111         elf->e_ident[EI_CLASS]  = elf_info->class;
112         elf->e_ident[EI_DATA]   = elf_info->data;
113         elf->e_ident[EI_VERSION]= EV_CURRENT;
114         elf->e_ident[EI_OSABI] = ELFOSABI_NONE;
115         memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
116         elf->e_type     = ET_CORE;
117         elf->e_machine  = crash_architecture(elf_info);
118         elf->e_version  = EV_CURRENT;
119         elf->e_entry    = 0;
120         elf->e_phoff    = sizeof(EHDR);
121         elf->e_shoff    = 0;
122         elf->e_flags    = 0;
123         elf->e_ehsize   = sizeof(EHDR);
124         elf->e_phentsize= sizeof(PHDR);
125         elf->e_phnum    = 0;
126         elf->e_shentsize= 0;
127         elf->e_shnum    = 0;
128         elf->e_shstrndx = 0;
Then the per-CPU crash notes are fetched via get_crash_notes_per_cpu(). This function reads "/sys/devices/system/cpu/cpuN/crash_notes" to obtain each note's address; the length is fixed at 1024 bytes.
Once the CPU crash notes are obtained, they are filled into the corresponding program headers.
141         for (i = 0; i < nr_cpus; i++) {
142                 if (get_note_info(i, &notes_addr, &notes_len) < 0) {
143                         /* This cpu is not present. Skip it. */
144                         continue;
145                 }
146
147                 phdr = (PHDR *) bufp;
148                 bufp += sizeof(PHDR);
149                 phdr->p_type   = PT_NOTE;
150                 phdr->p_flags   = 0;
151                 phdr->p_offset  = phdr->p_paddr = notes_addr;
152                 phdr->p_vaddr   = 0;
153                 phdr->p_filesz  = phdr->p_memsz = notes_len;
154                 /* Do we need any alignment of segments? */
155                 phdr->p_align   = 0;
156
157                 /* Increment number of program headers. */
158                 (elf->e_phnum)++;
159                 dbgprintf_phdr("Elf header", phdr);
160         }
After the CPU crash notes, it is vmcoreinfo's turn; its address and length were obtained earlier and stored in vmcoreinfo_addr and vmcoreinfo_len.
162         if (has_vmcoreinfo && !(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) {
163                 phdr = (PHDR *) bufp;
164                 bufp += sizeof(PHDR);
165                 phdr->p_type    = PT_NOTE;
166                 phdr->p_flags   = 0;
167                 phdr->p_offset  = phdr->p_paddr = vmcoreinfo_addr;
168                 phdr->p_vaddr   = 0;
169                 phdr->p_filesz  = phdr->p_memsz = vmcoreinfo_len;
170                 /* Do we need any alignment of segments? */
171                 phdr->p_align   = 0;
172
173                 (elf->e_phnum)++;
174                 dbgprintf_phdr("vmcoreinfo header", phdr);
175         }
Then it is time to set up the program headers for the memory ranges.
194         /* Setup PT_LOAD type program header for every system RAM chunk.
195          * A seprate program header for Backup Region*/
196         for (i = 0; i < ranges; i++, range++) {
197                 unsigned long long mstart, mend;
198                 if (range->type != RANGE_RAM)
199                         continue;
200                 mstart = range->start;
201                 mend = range->end;
202                 if (!mstart && !mend)
203                         continue;
204                 phdr = (PHDR *) bufp;
205                 bufp += sizeof(PHDR);
206                 phdr->p_type    = PT_LOAD;
207                 phdr->p_flags   = PF_R|PF_W|PF_X;
208                 phdr->p_offset  = mstart;
209
210                 if (mstart == info->backup_src_start
211                     && (mend - mstart + 1) == info->backup_src_size)
212                         phdr->p_offset  = info->backup_start;
213
214                 /* We already prepared the header for kernel text. Map
215                  * rest of the memory segments to kernel linearly mapped
216                  * memory region.
217                  */
218                 phdr->p_paddr = mstart;
219                 phdr->p_vaddr = phys_to_virt(elf_info, mstart);// paddr + elf_info->page_offset - phys_offset
220                 phdr->p_filesz  = phdr->p_memsz = mend - mstart + 1;
221                 /* Do we need any alignment of segments? */
222                 phdr->p_align   = 0;
223
224                 /* HIGMEM has a virtual address of -1 */
225
226                 if (elf_info->lowmem_limit
227                     && (mend > (elf_info->lowmem_limit - 1)))
228                         phdr->p_vaddr = -1;
229
230                 /* Increment number of program headers. */
231                 (elf->e_phnum)++;
232                 dbgprintf_phdr("Elf header", phdr);
233         }
234         return 0;
235 }
At this point crash_create_elf32_headers() has built the ELF core header in a user-space buffer; load_crashdump_segments() will place it at the end of the memory area reserved for the crash kernel, as we will see shortly.

From this, the ELF core file is composed of:
1. the ELF header;
2. one crash notes program header per CPU (note: on a processor with Hyper-Threading, this count is the number of logical cores, not physical ones; see http://en.wikipedia.org/wiki/Hyper-threading for more information. ARM processors, by the way, do not implement Hyper-Threading). The address where each CPU's crash notes are stored can be read from "/sys/devices/system/cpu/cpuN/crash_notes";
3. a vmcoreinfo program header; the address where vmcoreinfo is stored can be read from /sys/kernel/vmcoreinfo;
4. one program header per memory range, taken from usablemem_rgns.
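Putting this together for a hypothetical single-CPU ARM board with the two split RAM ranges from earlier, the header buffer looks roughly like this:

+----------------------------------------------+
| Elf32_Ehdr   (e_type = ET_CORE, e_phnum = 4) |
+----------------------------------------------+
| Elf32_Phdr   PT_NOTE -> cpu0 crash_notes     |
+----------------------------------------------+
| Elf32_Phdr   PT_NOTE -> vmcoreinfo           |
+----------------------------------------------+
| Elf32_Phdr   PT_LOAD -> RAM range 0          |
+----------------------------------------------+
| Elf32_Phdr   PT_LOAD -> RAM range 1          |
+----------------------------------------------+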
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
292         if (err)
293                 return err;
294
295         /*
296          * We allocate ELF core header from the end of the memory area reserved
297          * for the crashkernel. We align the header to SECTION_SIZE (which is
298          * 1MB) so that available memory passed in kernel command line will be
299          * aligned to 1MB. This is because kernel create_mapping() wants memory
300          * regions to be aligned to SECTION_SIZE.
301          */
302         elfcorehdr = add_buffer_phys_virt(info, buf, bufsz, bufsz, 1 << 20,
303                                           crash_reserved_mem.start,
304                                           crash_reserved_mem.end, -1, 0);
305
306         dbgprintf("elfcorehdr: %#lx\n", elfcorehdr);
307         cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
308
309         /*
310          * Add 'mem=size' parameter to dump capture kernel command line. This
311          * prevents the dump capture kernel from using any other memory regions
312          * which belong to the primary kernel.
313          */
314         cmdline_add_mem(mod_cmdline, elfcorehdr - crash_reserved_mem.start);
315
316         dump_memory_ranges();
317         dbgprintf("kernel command line: \"%s\"\n", mod_cmdline);
318
319         return 0;
320 }
Line 302 obtains elfcorehdr, the address pre-allocated for the ELF header and program headers. "elfcorehdr=X" is then added to the capture kernel's command line. Line 314 adds "mem=Z", where Z = elfcorehdr - crash_reserved_mem.start, which works out to the crash reserved memory size minus the (1MB-aligned) header area at the top.
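To make that concrete with the crashkernel=64M@98M example (a sketch; the exact value depends on alignment and header size):

crash_reserved_mem:  0x06200000 - 0x0a1fffff     (64MB @ 98MB)
elfcorehdr        =  0x0a100000                  (top of region, 1MB aligned)
mem               =  0x0a100000 - 0x06200000
                  =  0x03f00000                  (63MB, i.e. "mem=64512K")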
main->my_load->zImage_arm_load-> load_crashdump_segments-> add_buffer_phys_virt
First, let's walk through the parameters: info was initialized in my_load(); buf and bufsz describe the ELF header/program header buffer just built; memsz here equals bufsz; buf_align is 1MB; buf_min and buf_max bound the reserved memory range; buf_end is -1 (allocate from the top down), and phys is 0.
357 unsigned long add_buffer_phys_virt(struct kexec_info *info,
358         const void *buf, unsigned long bufsz, unsigned long memsz,
359         unsigned long buf_align, unsigned long buf_min, unsigned long buf_max,
360         int buf_end, int phys)
361 {
362         unsigned long base;
363         int result;
364         int pagesize;
365
366         result = sort_segments(info);
367         if (result < 0) {
368                 die("sort_segments failed\n");
369         }
370
371         /* Round memsz up to a multiple of pagesize */
372         pagesize = getpagesize();
373         memsz = _ALIGN(memsz, pagesize);
374
375         base = locate_hole(info, memsz, buf_align, buf_min, buf_max, buf_end);
376         if (base == ULONG_MAX) {
377                 die("locate_hole failed\n");
378         }
379
380        add_segment_phys_virt(info, buf, bufsz, base, memsz, phys);
381         return base;
382 }
Here base ends up in the last 1MB of the memory reserved by crashkernel=X@Y. The segment is then added to info by add_segment_phys_virt() at line 380.
main->my_load->zImage_arm_load(cont'd)
From line 335, base is the start of the reserved memory range.
340
341         if (base == ULONG_MAX)
342                 return -1;
343
344         /* assume the maximum kernel compression ratio is 4,
345          * and just to be safe, place ramdisk after that
346          */
347         initrd_base = base + len * 4;
348
349         if (use_atags) {
350                 /*
351                  * use ATAGs from /proc/atags
352                  */
353                 if (atag_arm_load(info, base + atag_offset,
354                                   command_line, command_line_len,
355                                   ramdisk_buf, initrd_size, initrd_base) == -1)
356                         return -1;
427         add_segment(info, buf, len, base + offset, len);
428
429         info->entry = (void*)base + offset;
430
431         return 0;
432 }

Now the "--atags" option is handled: atag_arm_load() reads /proc/atags at line 353 and adds the ATAGs to the segments in info. At the end, the kernel buffer itself is added to the segments.
main->my_load:
777
778         result = kexec_load(
779                 info.entry, info.nr_segments, info.segment, info.kexec_flags);
788         return result;
789 }

After the ELF core header is filled in, the ELF information, ATAGs, and kernel buffer have all been added to the segments, which are then passed to the kexec_load() system call.
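For reference, here is a minimal sketch of the interface crossing the user/kernel boundary (based on <linux/kexec.h>; field types differ slightly between kernel versions and between the user and kernel views, and glibc typically provides no wrapper, so kexec-tools invokes it via syscall(2)):

struct kexec_segment {
        const void *buf;        /* user-space buffer holding the data */
        size_t bufsz;           /* bytes of data in buf */
        const void *mem;        /* physical destination address */
        size_t memsz;           /* bytes reserved at the destination */
};

long kexec_load(unsigned long entry, unsigned long nr_segments,
                struct kexec_segment *segments, unsigned long flags);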


We have now analyzed the kexec load routine in user space. Next, the kernel-space code is analyzed.

3. Routine of Kernel Mode


In the kexec-tools source code, info->entry is filled with the start of the reserved crash memory plus a 32KB offset.

218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219         struct kexec_info *info)
328                 if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329                         /*
330                          * No crash kernel memory reserved. We cannot do more
331                          * but just bail out.
332                          */
333                         return -1;
334                 }
335                 base = start;
427         add_segment(info, buf, len, base + offset, len);
428
429         print_segment(stderr,info);
430         info->entry = (void*)base + offset;
}

For our case, crashkernel=64M@98M, i.e. 64MB reserved starting at 98MB. So info->entry = 98MB + 32KB. If we load kdump via "kexec -p", the flags are KEXEC_ARCH_ARM | KEXEC_ON_CRASH.
The segments parameter is allocated by kexec-tools to record the ATAGs, ramdisk, kernel, and elfcorehdr. The ATAGs, ramdisk, and kernel buffers are added to info->segment[] by calling add_segment(); the elfcorehdr is added by calling add_segment_phys_virt(). The layout is sketched below.
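A rough sketch of the segment layout inside the reserved region (assuming crashkernel=64M@98M and the ATAGs path; exact addresses vary):

0x06200000   base            start of crashkernel memory
0x06201000   base + 0x1000   ATAGs
0x06208000   base + 0x8000   zImage (info->entry points here)
base+len*4                   initrd (if one was given)
0x0a100000   top - 1MB       elfcorehdr (ELF core headers)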


941 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 942                 struct kexec_segment __user *, segments, unsigned long, flags)
 943 {
 944         struct kimage **dest_image, *image;
 945         int result;
 …
 969         image = NULL;
 970         result = 0;
 971
 972         /* Because we write directly to the reserved memory
 973          * region when loading crash kernels we need a mutex here to
 974          * prevent multiple crash  kernels from attempting to load
 975          * simultaneously, and to prevent a crash kernel from loading
 976          * over the top of a in use crash kernel.
 977          *
 978          * KISS: always take the mutex.
 979          */
 980         if (!mutex_trylock(&kexec_mutex))
 981                 return -EBUSY;
 984         if (flags & KEXEC_ON_CRASH)
 985                 dest_image = &kexec_crash_image;
 986         if (nr_segments > 0) {
 987                 unsigned long i;
 988
 989                 /* Loading another kernel to reboot into */
 …
 994                 else if (flags & KEXEC_ON_CRASH) {
 995                         /* Free any current crash dump kernel before
 996                          * we corrupt it.
 997                          */
 998                         kimage_free(xchg(&kexec_crash_image, NULL));
 999                         result = kimage_crash_alloc(&image, entry,
1000                                                      nr_segments, segments);
1001                         crash_map_reserved_pages(); // an empty stub on ARM
1002                 }
1003                 if (result)
1004                         goto out;
1005
1006                 if (flags & KEXEC_PRESERVE_CONTEXT)
1007                         image->preserve_context = 1;
1008                 result = machine_kexec_prepare(image);
1009                 if (result)
1010                         goto out;
1011
1012                 for (i = 0; i < nr_segments; i++) {
1013                         result = kimage_load_segment(image, &image->segment[i]);
1014                         if (result)
1015                                 goto out;
1016                 }
1017                 kimage_terminate(image);
1018                 if (flags & KEXEC_ON_CRASH)
1019                         crash_unmap_reserved_pages();
1020         }
1021         /* Install the new kernel, and  Uninstall the old */
1022         image = xchg(dest_image, image);
1023
1024 out:
1025         mutex_unlock(&kexec_mutex);
1026         kimage_free(image);
1027
1028         return result;
1029 }

kexec_load-> kimage_crash_alloc
The crash-load path performs three steps:
1. Allocate and initialize a struct kimage object (line 999).
2. Allocate a control page, which will be used when the soft reboot happens (also from line 999, inside kimage_crash_alloc()).
3. Copy the buffers recorded in info->segment[] into the reserved memory range (lines 1012~1016).

 272 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 273                                 unsigned long nr_segments,
 274                                 struct kexec_segment __user *segments)
 275 {
 276         int result;
 277         struct kimage *image;
 278         unsigned long i;
 279
 280         image = NULL;
 281         /* Verify we have a valid entry point */
 282         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 283                 result = -EADDRNOTAVAIL;
 284                 goto out;
 285         }
 286
 287         /* Allocate and initialize a controlling structure */
 288         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 289         if (result)
 290                 goto out;
 291
 292         /* Enable the special crash kernel control page
 293          * allocation policy.
 294          */
 295         image->control_page = crashk_res.start;
 296         image->type = KEXEC_TYPE_CRASH;
 297
 298         /*
 299          * Verify we have good destination addresses.  Normally
 300          * the caller is responsible for making certain we don't
 301          * attempt to load the new image into invalid or reserved
 302          * areas of RAM.  But crash kernels are preloaded into a
 303          * reserved area of ram.  We must ensure the addresses
 304          * are in the reserved area otherwise preloading the
 305          * kernel could corrupt things.
 306          */
 307         result = -EADDRNOTAVAIL;
 308         for (i = 0; i < nr_segments; i++) {
 309                 unsigned long mstart, mend;
 310
 311                 mstart = image->segment[i].mem;
 312                 mend = mstart + image->segment[i].memsz - 1;
 313                 /* Ensure we are within the crash kernel limits */
 314                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 315                         goto out;
 316         }
 317
 318         /*
 319          * Find a location for the control code buffer, and add
 320          * the vector of segments so that it's pages will also be
 321          * counted as destination pages.
 322          */
 323         result = -ENOMEM;
 324         image->control_code_page = kimage_alloc_control_pages(image,
 325                                            get_order(KEXEC_CONTROL_PAGE_SIZE));
 326         if (!image->control_code_page) {
 327                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 328                 goto out;
 329         }
 330
 331         result = 0;
 332 out:
 333         if (result == 0)
 334                 *rimage = image;
 335         else
 336                 kfree(image);
 337
 338         return result;
 339 }

Line 288 allocates a struct kimage object and initializes it.

Lines 308~316 check each segment's memsz range, ensuring it does not fall outside the crashkernel reserved memory range.

kexec_load-> kimage_crash_alloc-> do_kimage_alloc

118 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 119                             unsigned long nr_segments,
 120                             struct kexec_segment __user *segments)
 121 {
 122         size_t segment_bytes;
 123         struct kimage *image;
 124         unsigned long i;
 125         int result;
 126
 127         /* Allocate a controlling structure */
 128         result = -ENOMEM;
 129         image = kzalloc(sizeof(*image), GFP_KERNEL);
 130         if (!image)
 131                 goto out;
 132
 133         image->head = 0;
 134         image->entry = &image->head;
 135         image->last_entry = &image->head;
 136         image->control_page = ~0; /* By default this does not apply */
 137         image->start = entry;
 138         image->type = KEXEC_TYPE_DEFAULT;
 139
 140         /* Initialize the list of control pages */
 141         INIT_LIST_HEAD(&image->control_pages);
 142
 143         /* Initialize the list of destination pages */
 144         INIT_LIST_HEAD(&image->dest_pages);
 145
 146         /* Initialize the list of unusable pages */
 147         INIT_LIST_HEAD(&image->unuseable_pages);
 148
 149         /* Read in the segments */
 150         image->nr_segments = nr_segments;
 151         segment_bytes = nr_segments * sizeof(*segments);
 152         result = copy_from_user(image->segment, segments, segment_bytes);
 153         if (result) {
 154                 result = -EFAULT;
 155                 goto out;
 156         }
 157
 158         /*
 159          * Verify we have good destination addresses.  The caller is
 160          * responsible for making certain we don't attempt to load
 161          * the new image into invalid or reserved areas of RAM.  This
 162          * just verifies it is an address we can use.
 163          *
 164          * Since the kernel does everything in page size chunks ensure
 165          * the destination addresses are page aligned.  Too many
 166          * special cases crop of when we don't do this.  The most
 167          * insidious is getting overlapping destination addresses
 168          * simply because addresses are changed to page size
 169          * granularity.
 170          */
 171         result = -EADDRNOTAVAIL;
 172         for (i = 0; i < nr_segments; i++) {
 173                 unsigned long mstart, mend;
 174
 175                 mstart = image->segment[i].mem;
 176                 mend   = mstart + image->segment[i].memsz;
 177                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 178                         goto out;
 179                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 180                         goto out;
 181         }
 182
 183         /* Verify our destination addresses do not overlap.
 184          * If we alloed overlapping destination addresses
 185          * through very weird things can happen with no
 186          * easy explanation as one segment stops on another.
 187          */
 188         result = -EINVAL;
 189         for (i = 0; i < nr_segments; i++) {
 190                 unsigned long mstart, mend;
 191                 unsigned long j;
 192
 193                 mstart = image->segment[i].mem;
 194                 mend   = mstart + image->segment[i].memsz;
 195                 for (j = 0; j < i; j++) {
 196                         unsigned long pstart, pend;
 197                         pstart = image->segment[j].mem;
 198                         pend   = pstart + image->segment[j].memsz;
 199                         /* Do the segments overlap ? */
 200                         if ((mend > pstart) && (mstart < pend))
 201                                 goto out;
 202                 }
 203         }
 204
 205         /* Ensure our buffer sizes are strictly less than
 206          * our memory sizes.  This should always be the case,
 207          * and it is easier to check up front than to be surprised
 208          * later on.
 209          */
 210         result = -EINVAL;
 211         for (i = 0; i < nr_segments; i++) {
 212                 if (image->segment[i].bufsz > image->segment[i].memsz)
 213                         goto out;
 214         }
 215
 216         result = 0;
 217 out:
 218         if (result == 0)
 219                 *rimage = image;
 220         else
 221                 kfree(image);
 222
 223         return result;
 224
 225 }

kexec_load-> kimage_crash_alloc->kimage_alloc_control_pages-> kimage_alloc_crash_control_pages
This function completes step 2 from the list above.

467 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 468                                                       unsigned int order)
 469 {
 470         /* Control pages are special, they are the intermediaries
 471          * that are needed while we copy the rest of the pages
 472          * to their final resting place.  As such they must
 473          * not conflict with either the destination addresses
 474          * or memory the kernel is already using.
 475          *
 476          * Control pages are also the only pags we must allocate
 477          * when loading a crash kernel.  All of the other pages
 478          * are specified by the segments and we just memcpy
 479          * into them directly.
 480          *
 481          * The only case where we really need more than one of
 482          * these are for architectures where we cannot disable
 483          * the MMU and must instead generate an identity mapped
 484          * page table for all of the memory.
 485          *
 486          * Given the low demand this implements a very simple
 487          * allocator that finds the first hole of the appropriate
 488          * size in the reserved memory region, and allocates all
 489          * of the memory up to and including the hole.
 490          */
 491         unsigned long hole_start, hole_end, size;
 492         struct page *pages;
 493
 494         pages = NULL;
 495         size = (1 << order) << PAGE_SHIFT;
 496         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 497         hole_end   = hole_start + size - 1;
 498         while (hole_end <= crashk_res.end) {
 499                 unsigned long i;
 500
 501                 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
 502                         break;
 503                 if (hole_end > crashk_res.end)
 504                         break;
 505                 /* See if I overlap any of the segments */
 506                 for (i = 0; i < image->nr_segments; i++) {
 507                         unsigned long mstart, mend;
 508
 509                         mstart = image->segment[i].mem;
 510                         mend   = mstart + image->segment[i].memsz - 1;
 511                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 512                                 /* Advance the hole to the end of the segment */
 513                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 514                                 hole_end   = hole_start + size - 1;
 515                                 break;
 516                         }
 517                 }
 518                 /* If I don't overlap any segments I have found my hole! */
 519                 if (i == image->nr_segments) {
 520                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 521                         break;
 522                 }
 523         }
 524         if (pages)
 525                 image->control_page = hole_end;
 526
 527         return pages;
 528 }
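For instance, with 4KB pages and order 0 (size = 4096), the search starts at hole_start = crashk_res.start, i.e. 0x06200000 in our example. Since the first loaded segment (the ATAGs) sits at base + 0x1000, the first page of the reserved region overlaps no segment and becomes the control page; image->control_page is then advanced to hole_end so later control allocations continue from there. Had the hole overlapped a segment, hole_start would be bumped past that segment's end, rounded up to the allocation size.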

Next, the buffers are copied into memory allocated within the crash reserved memory range, completing step 3 from the list above.
kexec_load ->kimage_load_segment->kimage_load_crash_segment:
 845 static int kimage_load_crash_segment(struct kimage *image,
 846                                         struct kexec_segment *segment)
 847 {
 848         /* For crash dumps kernels we simply copy the data from
 849          * user space to it's destination.
 850          * We do things a page at a time for the sake of kmap.
 851          */
 852         unsigned long maddr;
 853         unsigned long ubytes, mbytes;
 854         int result;
 855         unsigned char __user *buf;
 856
 857         result = 0;
 858         buf = segment->buf;
 859         ubytes = segment->bufsz;
 860         mbytes = segment->memsz;
 861         maddr = segment->mem;
 862         while (mbytes) {
 863                 struct page *page;
 864                 char *ptr;
 865                 size_t uchunk, mchunk;
 866
 867                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 868                 if (!page) {
 869                         result  = -ENOMEM;
 870                         goto out;
 871                 }
 872                 ptr = kmap(page);
 873                 ptr += maddr & ~PAGE_MASK;
 874                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 875                 if (mchunk > mbytes)
 876                         mchunk = mbytes;
 877
 878                 uchunk = mchunk;
 879                 if (uchunk > ubytes) {
 880                         uchunk = ubytes;
 881                         /* Zero the trailing part of the page */
 882                         memset(ptr + uchunk, 0, mchunk - uchunk);
 883                 }
 884                 result = copy_from_user(ptr, buf, uchunk);
 885                 kexec_flush_icache_page(page);
 886                 kunmap(page);
 887                 if (result) {
 888                         result = -EFAULT;
 889                         goto out;
 890                 }
 891                 ubytes -= uchunk;
 892                 maddr  += mchunk;
 893                 buf    += mchunk;
 894                 mbytes -= mchunk;
 895         }
 896 out:
 897         return result;
 898 }
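As a concrete example (hypothetical sizes): for a segment with bufsz = 5000 and memsz = 8192, the first iteration copies 4096 user bytes into the first destination page; the second iteration copies the remaining 904 bytes and zero-fills the other 3192 bytes of that page (mchunk - uchunk), so the tail of the reserved destination is cleared rather than left with stale data.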
