Understanding Kdump (Loading Part)
Table of Contents
1. Introduction
2. Routine of User Mode
3. Routine of Kernel Mode
1. Introduction
kdump, which is based on kexec, is used to debug the Linux kernel. Currently, kdump and kexec are both integrated into the kexec-tools program. After compiling the program, run "build/sbin/kexec" with the "-l" parameter to use the plain kexec function; the "-p" parameter is for kdump. The main differences between kexec and kdump are listed below.
1. kexec's second kernel overwrites the first kernel, while kdump's capture kernel is loaded into its own reserved memory.
2. To use kdump, you must boot the first kernel with "crashkernel=x@y" on its command line. After the second (capture) kernel comes up, you will find that kexec-tools has appended two parameters to its command line: "mem=A" and "elfcorehdr=B". "mem" limits the second kernel's available memory, and "elfcorehdr" tells the capture kernel where the debug information (the ELF core header) can be found.
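For illustration, a typical sequence might look like this (the image names and values are examples only, not from a specific setup):

# first kernel booted with, e.g.:  crashkernel=64M@98M
build/sbin/kexec -p zImage --append="console=ttyS0 root=/dev/mmcblk0p2" \
        --dtb=board.dtb
# after a crash, the capture kernel's command line additionally carries
# something like:  mem=63M elfcorehdr=0xA100000  (exact formatting comes
# from kexec-tools)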
Now let's walk through the kdump load routine.
2. Routine of User Mode
1061 int main(int argc, char *argv[])
1062 {
1063         int do_load = 1;
1064         int do_exec = 0;
1065         int do_load_jump_back_helper = 0;
1066         int do_shutdown = 1;
1067         int do_sync = 1;
1068         int do_ifdown = 0;
1069         int do_unload = 0;
1070         int do_reuse_initrd = 0;
1071         void *entry = 0;
1072         char *type = 0;
1073         char *endptr;
1074         int opt;
1075         int result = 0;
1076         int fileind;
1077         static const struct option options[] = {
1078                 KEXEC_ALL_OPTIONS
1079                 { 0, 0, 0, 0},
1080         };
1081         static const char short_options[] = KEXEC_ALL_OPT_STR;
1082
1083         while ((opt = getopt_long(argc, argv, short_options,
1084                                   options, 0)) != -1) {
1085                 switch(opt) {
1086                 case '?':
1087                 case OPT_HELP:
1088                         usage();
1089                         return 0;
1090                 case OPT_VERSION:
…
1152                 case OPT_PANIC:
1153                         do_load = 1;
1154                         do_exec = 0;
1155                         do_shutdown = 0;
1156                         do_sync = 0;
1157                         kexec_flags = KEXEC_ON_CRASH;
1158                         break;
…
1187         if ((kexec_flags & KEXEC_ON_CRASH) && !is_crashkernel_mem_reserved()) {
1188                 printf("Memory for crashkernel is not reserved\n");
1189                 printf("Please reserve memory by passing ");
1190                 printf("\"crashkernel=X@Y\" parameter to the kernel\n");
1191                 die("Then try loading kdump kernel\n");
1192         }
1193
…
1201         fileind = optind;
1202         /* Reset getopt for the next pass; called in other source modules */
1203         opterr = 1;
1204         optind = 1;
1205
1206         result = arch_process_options(argc, argv);
…
1227         if (do_load && (result == 0)) {
1228                 result = my_load(type, fileind, argc, argv, kexec_flags, entry);
1229         }
…
main->my_load:
648 /*
649  *      Load the new kernel
650  */
651 static int my_load(const char *type, int fileind, int argc, char **argv,
652                    unsigned long kexec_flags, void *entry)
653 {
654         char *kernel;
655         char *kernel_buf;
656         off_t kernel_size;
657         int i = 0;
658         int result;
659         struct kexec_info info;
660         long native_arch;
661         int guess_only = 0;
662
 663         memset(&info, 0, sizeof(info));
 664         info.segment = NULL;
 665         info.nr_segments = 0;
 666         info.entry = NULL;
 667         info.backup_start = 0;
 668         info.kexec_flags = kexec_flags;
669
670         result = 0;
671         if (argc - fileind <= 0) {
672                 fprintf(stderr, "No kernel specified\n");
673                 usage();
674                 return -1;
675         }
676         kernel = argv[fileind];
677         /* slurp in the input kernel */
 678         kernel_buf = slurp_decompress_file(kernel, &kernel_size);
679
680         dbgprintf("kernel: %p kernel_size: %lx\n",
681                   kernel_buf, kernel_size);
682
683         if (get_memory_ranges(&info.memory_range, &info.memory_ranges,
 684                 info.kexec_flags) < 0 || info.memory_ranges == 0) {
685                 fprintf(stderr, "Could not get memory layout\n");
686                 return -1;
687         }
688         /* if a kernel type was specified, try to honor it */
689         if (type) {
690                 for (i = 0; i < file_types; i++) {
691                         if (strcmp(type, file_type[i].name) == 0)
692                                 break;
693                 }
694                 if (i == file_types) {
695                         fprintf(stderr, "Unsupported kernel type %s\n", type);
696                         return -1;
697                 } else {
698                         /* make sure our file is really of that type */
699                         if (file_type[i].probe(kernel_buf, kernel_size) < 0)
700                                 guess_only = 1;
701                 }
702         }
 703         if (!type || guess_only) {
704                 for (i = 0; i < file_types; i++) {
705                         if (file_type[i].probe(kernel_buf, kernel_size) >= 0)
706                                 break;
707                 }
708                 if (i == file_types) {
709                         fprintf(stderr, "Cannot determine the file type "
710                                         "of %s\n", kernel);
711                         return -1;
712                 } else {
713                         if (guess_only) {
714                                 fprintf(stderr, "Wrong file type %s, "
715                                         "file matches type %s\n",
716                                         type, file_type[i].name);
717                                 return -1;
718                         }
719                 }
720         }
721         /* Figure out our native architecture before load */
722         native_arch = physical_arch();
723         if (native_arch < 0) {
724                 return -1;
725         }
 726         info.kexec_flags |= native_arch;//Now there are two flags, KEXEC_ON_CRASH|KEXEC_ARCH_ARM
727
728         result = file_type[i].load(argc, argv, kernel_buf, kernel_size, &info);//0: success
main->my_load->zImage_arm_load:
218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219         struct kexec_info *info)
220 {
221         unsigned long base;
222         unsigned int atag_offset = 0x1000; /* 4k offset from memory start */
223         unsigned int offset = 0x8000;      /* 32k offset from memory start */
224         const char *command_line;
225         char *modified_cmdline = NULL;
226         off_t command_line_len;
227         const char *ramdisk;
228         char *ramdisk_buf;
229         int opt;
230         int use_atags;
231         char *dtb_buf;
232         off_t dtb_length;
233         char *dtb_file;
234         off_t dtb_offset;
235         dbgprintf("buf:%p, len:%lx\n",buf,len);
236         /* See options.h -- add any more there, too. */
237         static const struct option options[] = {
238                 KEXEC_ARCH_OPTIONS
239                 { "command-line",       1, 0, OPT_APPEND },
240                 { "append",             1, 0, OPT_APPEND },
241                 { "initrd",             1, 0, OPT_RAMDISK },
242                 { "ramdisk",            1, 0, OPT_RAMDISK },
243                 { "dtb",                1, 0, OPT_DTB },
244                 { "atags",              0, 0, OPT_ATAGS },
245                 { 0,                    0, 0, 0 },
246         };
247         static const char short_options[] = KEXEC_ARCH_OPT_STR "a:r:";
248
249         /*
250          * Parse the command line arguments
251          */
252         command_line = 0;
253         command_line_len = 0;
254         ramdisk = 0;
255         ramdisk_buf = 0;
256         initrd_size = 0;
257         use_atags = 0;
258         dtb_file = NULL;
259         while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
260                 switch(opt) {
261                 default:
262                         /* Ignore core options */
263                         if (opt < OPT_ARCH_MAX) {
264                                 break;
265                         }
266                 case '?':
267                         usage();
268                         return -1;
269                 case OPT_APPEND:
270                         command_line = optarg;
271                         break;
272                 case OPT_RAMDISK:
273                         ramdisk = optarg;
274                         break;
275                 case OPT_DTB:
276                         dtb_file = optarg;
277                         break;
278                 case OPT_ATAGS:
279                         use_atags = 1;
280                         break;
281                 }
282         }
283
284         if (use_atags && dtb_file) {
285                 fprintf(stderr, "You can only use ATAGs if you don't specify a "
286                         "dtb file.\n");
287                 return -1;
288         }
289
290         if (command_line) {
291                 command_line_len = strlen(command_line) + 1;
292                 if (command_line_len > COMMAND_LINE_SIZE)
293                         command_line_len = COMMAND_LINE_SIZE;
294         }
295         if (ramdisk) {//Read init ramdisk to memory.
296                 ramdisk_buf = slurp_file(ramdisk, &initrd_size);
297         }
298
299         /*
300          * If we are loading a dump capture kernel, we need to update kernel
301          * command line and also add some additional segments.
302          */
303         if (info->kexec_flags & KEXEC_ON_CRASH) {
304                 uint64_t start, end;
305
306                 modified_cmdline = xmalloc(COMMAND_LINE_SIZE);
307                 if (!modified_cmdline)
308                         return -1;
309
310                 if (command_line) {
311                         (void) strncpy(modified_cmdline, command_line,
312                                        COMMAND_LINE_SIZE);
313                         modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
314                 }
315
316                 if (load_crashdump_segments(info, modified_cmdline) < 0) {
317                         free(modified_cmdline);
318                         return -1;
319                 }
320
321                 command_line = modified_cmdline;
322                 command_line_len = strlen(command_line) + 1;
323
324                 /*
325                  * We put the dump capture kernel at the start of crashkernel
326                  * reserved memory.
327                  */
328                 if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329                         /*
330                          * No crash kernel memory reserved. We cannot do more
331                          * but just bail out.
332                          */
333                         return -1;
334                 }
335                 base = start;
336         } else {
337                 dbgprintf("len:%lx,offset:%ux,len+offset:%lx\n",len,offset,len+offset);
338                 base = locate_hole(info,len+offset,0,0,ULONG_MAX,INT_MAX);
339         }
Lines 303~314 and 321~322 handle the cmdline. Line 335, which sets base to the start of the reserved memory, deserves attention later. Now let's look at line 316.
main->my_load->zImage_arm_load-> load_crashdump_segments
255 /**
256  * load_crashdump_segments() - loads additional segments needed for kdump
257  * @info: kexec info structure
258  * @mod_cmdline: kernel command line
259  *
260  * This function loads additional segments which are needed for the dump capture
261  * kernel. It also updates kernel command line passed in @mod_cmdline to have
262  * right parameters for the dump capture kernel.
263  *
264  * Return %0 in case of success and %-1 in case of error.
265  */
266 int load_crashdump_segments(struct kexec_info *info, char *mod_cmdline)
267 {
268         unsigned long elfcorehdr;
269         unsigned long bufsz;
270         void *buf;
271         int err;
272
273         /*
274          * First fetch all the memory (RAM) ranges that we are going to pass to
275          * the crashdump kernel during panic.
276          */
277         err = crash_get_memory_ranges();
main->my_load->zImage_arm_load-> load_crashdump_segments->crash_get_memory_ranges
155 static int crash_get_memory_ranges(void)
156 {
157         /*
158          * First read all memory regions that can be considered as
159          * system memory including the crash area.
160          */
161         kexec_iomem_for_each_line(NULL, crash_range_callback, NULL);
…
167
168         /*
169          * Exclude memory reserved for crashkernel (this may result a split memory
170          * region).
171          */
172         crash_exclude_range();
173
174         /*
175          * Make sure that the memory regions are sorted.
176          */
177         qsort(usablemem_rgns.ranges, usablemem_rgns.size,
178               sizeof(*usablemem_rgns.ranges), range_cmp);
179
180         return 0;
181 }
At line 161, kexec_iomem_for_each_line() walks /proc/iomem and:
1. stores the "System RAM" regions in usablemem_rgns.ranges;
2. looks up the "Crash kernel" region and stores it in crash_reserved_mem.
Then, at line 172, crash_exclude_range() removes the crash_reserved_mem range from the "System RAM" range in usablemem_rgns.ranges, so usablemem_rgns is split into two ranges.
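A minimal sketch of the splitting idea (illustrative only, not the kexec-tools implementation; exclude_range() here is a hypothetical helper):

struct memory_range { unsigned long long start, end; };

/* Carve the reserved window out of a RAM range; returns how many pieces
 * are left (2 means the range was split around the reservation). */
static int exclude_range(struct memory_range ram, struct memory_range resv,
                         struct memory_range out[2])
{
        int n = 0;

        if (ram.start < resv.start)     /* piece below the reservation */
                out[n++] = (struct memory_range){ ram.start, resv.start - 1 };
        if (resv.end < ram.end)         /* piece above the reservation */
                out[n++] = (struct memory_range){ resv.end + 1, ram.end };
        return n;
}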
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
281         /*
282          * Now that we have memory regions sorted, we can use first memory
283          * region as PHYS_OFFSET.
284          */
285         phys_offset = usablemem_rgns.ranges->start;
286         dbgprintf("phys_offset: %#lx\n", phys_offset);
287
288         err = crash_create_elf32_headers(info, &elf_info,
289                                          usablemem_rgns.ranges,
290                                          usablemem_rgns.size, &buf, &bufsz,
291                                          ELF_CORE_HEADER_ALIGN);
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)
elf_info is defined below.
48 static struct crash_elf_info elf_info = {
49         .class          = ELFCLASS32,
50         .data           = ELFDATA2LSB,
51         .machine        = EM_ARM,
52         .page_offset    = PAGE_OFFSET,
53 };
29 int FUNC(struct kexec_info *info,
30          struct crash_elf_info *elf_info,
31          struct memory_range *range, int ranges,
32          void **buf, unsigned long *size, unsigned long align) //buf and size are output parameters
33 {
34         EHDR *elf;
35         PHDR *phdr;
36         int i;
37         unsigned long sz;
38         char *bufp;
39         long int nr_cpus = 0;
40         uint64_t notes_addr, notes_len;
41         uint64_t vmcoreinfo_addr, vmcoreinfo_len;
42         int has_vmcoreinfo = 0;
43         int (*get_note_info)(int cpu, uint64_t *addr, uint64_t *len);
44
45         if (xen_present())
46                 nr_cpus = xen_get_nr_phys_cpus();
47         else
48                 nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
49
50         if (nr_cpus < 0) {
51                 return -1;
52         }
53
54         if (xen_present()) {
55                 if (!get_xen_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
56                         has_vmcoreinfo = 1;
57         } else
58                 if (!get_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
59                         has_vmcoreinfo = 1;
get_kernel_vmcoreinfo() reads the address and length of the vmcoreinfo data from /sys/kernel/vmcoreinfo. Let's look at the details below.
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)-> get_kernel_vmcoreinfo
139 /* Returns the physical address of start of crash notes buffer for a kernel. */
140 int get_kernel_vmcoreinfo(uint64_t *addr, uint64_t *len)
141 {
142         return get_vmcoreinfo("/sys/kernel/vmcoreinfo", addr, len);
143 }
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)-> get_kernel_vmcoreinfo-> get_vmcoreinfo
113 static int get_vmcoreinfo(const char *kdump_info, uint64_t *addr, uint64_t *len)
114 {
115         char line[MAX_LINE];
116         int count;
117         FILE *fp;
118         unsigned long long temp, temp2;
119
120         *addr = 0;
121         *len = 0;
122
123         if (!(fp = fopen(kdump_info, "r")))
124                 return -1;
125
126         if (!fgets(line, sizeof(line), fp))
127                 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
128         count = sscanf(line, "%Lx %Lx", &temp, &temp2);
129         if (count != 2)
130                 die("Cannot parse %s: %s\n", kdump_info, strerror(errno));
131
132         *addr = (uint64_t) temp;
133         *len = (uint64_t) temp2;
134
135         fclose(fp);
136         return 0;
137 }
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)  (cont'd)
61         sz = sizeof(EHDR) + (nr_cpus + has_vmcoreinfo) * sizeof(PHDR) +
62              ranges * sizeof(PHDR);
EHDR is `struct Elf32_Ehdr` and PHDR is a program header (`struct Elf32_Phdr`). Per the formula at lines 61~62, we have two memory ranges (after the split), the vmcoreinfo note, and one crash-notes entry per CPU, so on a single-CPU system sz = sizeof(EHDR) + 2*sizeof(PHDR) + 2*sizeof(PHDR).
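As a concrete check (standard ELF32 sizes; one CPU assumed):

sz = sizeof(Elf32_Ehdr)                        /* 52 bytes        */
   + (1 /* nr_cpus */ + 1 /* vmcoreinfo */)
     * sizeof(Elf32_Phdr)                      /* 32 bytes each   */
   + 2 /* RAM ranges */ * sizeof(Elf32_Phdr);
/* = 52 + 4 * 32 = 180 bytes, before the _ALIGN(sz, align) below */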
main->my_load->zImage_arm_load-> load_crashdump_segments-> crash_create_elf32_headers (FUNC)  (cont'd)
63
64         /*
65          * Certain architectures such as x86_64 and ia64 require a separate
66          * PT_LOAD program header for the kernel. This is controlled through
67          * elf_info->kern_size.
68          *
69          * The separate PT_LOAD program header is required either because the
70          * kernel is mapped at a different location than the rest of the
71          * physical memory or because we need to support relocatable kernels.
72          * Or both as on x86_64.
73          *
74          * In the relocatable kernel case this PT_LOAD segment is used to tell
75          * where the kernel was actually loaded which may be different from
76          * the load address present in the vmlinux file.
77          *
78          * The extra kernel PT_LOAD program header results in a vmcore file
79          * which is larger than the size of the physical memory. This is
80          * because the memory for the kernel is present both in the kernel
81          * PT_LOAD program header and in the physical RAM program headers.
82          */
83
84         if (elf_info->kern_size && !xen_present()) {
85                 sz += sizeof(PHDR);
86         }
elf_info->kern_size is zero here, so no extra PT_LOAD is added. Next, the ELF header is filled in.
99         sz = _ALIGN(sz, align);
100
101         bufp = xmalloc(sz);
102         memset(bufp, 0, sz);
103
104         *buf = bufp;
105         *size = sz;
106
107         /* Setup ELF Header*/
108         elf = (EHDR *) bufp;
109         bufp += sizeof(EHDR);
110         memcpy(elf->e_ident, ELFMAG, SELFMAG);
111         elf->e_ident[EI_CLASS]  = elf_info->class;
112         elf->e_ident[EI_DATA]   = elf_info->data;
113         elf->e_ident[EI_VERSION]= EV_CURRENT;
114         elf->e_ident[EI_OSABI] = ELFOSABI_NONE;
115         memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
116         elf->e_type     = ET_CORE;
117         elf->e_machine  = crash_architecture(elf_info);
118         elf->e_version  = EV_CURRENT;
119         elf->e_entry    = 0;
120         elf->e_phoff    = sizeof(EHDR);
121         elf->e_shoff    = 0;
122         elf->e_flags    = 0;
123         elf->e_ehsize   = sizeof(EHDR);
124         elf->e_phentsize= sizeof(PHDR);
125         elf->e_phnum    = 0;
126         elf->e_shentsize= 0;
127         elf->e_shnum    = 0;
128         elf->e_shstrndx = 0;
Then the crash notes information is obtained via get_crash_notes_per_cpu(). This function reads "/sys/devices/system/cpu/cpuN/crash_notes" to get the per-CPU note address; the length is fixed at 1024 bytes.
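A minimal sketch of such a reader, in the spirit of get_crash_notes_per_cpu() (the sysfs path is real; error handling is trimmed and the fixed 1024-byte length mirrors the note-buffer size mentioned above):

#include <stdio.h>
#include <stdint.h>

static int read_crash_notes(int cpu, uint64_t *addr, uint64_t *len)
{
        char path[128];
        unsigned long long a;
        FILE *fp;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/crash_notes", cpu);
        if (!(fp = fopen(path, "r")))
                return -1;              /* CPU not present */
        if (fscanf(fp, "%llx", &a) != 1) {
                fclose(fp);
                return -1;
        }
        fclose(fp);
        *addr = a;
        *len  = 1024;                   /* fixed note-buffer size */
        return 0;
}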
After obtaining the per-CPU crash notes, each one is filled into a corresponding program header:
141         for (i = 0; i < nr_cpus; i++) {
142                 if (get_note_info(i, &notes_addr, &notes_len) < 0) {
143                         /* This cpu is not present. Skip it. */
144                         continue;
145                 }
146
147                 phdr = (PHDR *) bufp;
148                 bufp += sizeof(PHDR);
149                 phdr->p_type   = PT_NOTE;
150                 phdr->p_flags   = 0;
151                 phdr->p_offset  = phdr->p_paddr = notes_addr;
152                 phdr->p_vaddr   = 0;
153                 phdr->p_filesz  = phdr->p_memsz = notes_len;
154                 /* Do we need any alignment of segments? */
155                 phdr->p_align   = 0;
156
157                 /* Increment number of program headers. */
158                 (elf->e_phnum)++;
159                 dbgprintf_phdr("Elf header", phdr);
160         }
After the per-CPU crash notes comes vmcoreinfo. Its address and length were obtained earlier and stored in vmcoreinfo_addr and vmcoreinfo_len.
162         if (has_vmcoreinfo && !(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) {
163                 phdr = (PHDR *) bufp;
164                 bufp += sizeof(PHDR);
165                 phdr->p_type    = PT_NOTE;
166                 phdr->p_flags   = 0;
167                 phdr->p_offset  = phdr->p_paddr = vmcoreinfo_addr;
168                 phdr->p_vaddr   = 0;
169                 phdr->p_filesz  = phdr->p_memsz = vmcoreinfo_len;
170                 /* Do we need any alignment of segments? */
171                 phdr->p_align   = 0;
172
173                 (elf->e_phnum)++;
174                 dbgprintf_phdr("vmcoreinfo header", phdr);
175         }
Then it is time to set up the program headers for the memory ranges.
194         /* Setup PT_LOAD type program header for every system RAM chunk.
195          * A seprate program header for Backup Region*/
196         for (i = 0; i < ranges; i++, range++) {
197                 unsigned long long mstart, mend;
198                 if (range->type != RANGE_RAM)
199                         continue;
200                 mstart = range->start;
201                 mend = range->end;
202                 if (!mstart && !mend)
203                         continue;
204                 phdr = (PHDR *) bufp;
205                 bufp += sizeof(PHDR);
206                 phdr->p_type    = PT_LOAD;
207                 phdr->p_flags   = PF_R|PF_W|PF_X;
208                 phdr->p_offset  = mstart;
209
210                 if (mstart == info->backup_src_start
211                     && (mend - mstart + 1) == info->backup_src_size)
212                         phdr->p_offset  = info->backup_start;
213
214                 /* We already prepared the header for kernel text. Map
215                  * rest of the memory segments to kernel linearly mapped
216                  * memory region.
217                  */
218                 phdr->p_paddr = mstart;
219                 phdr->p_vaddr = phys_to_virt(elf_info, mstart);// paddr + elf_info->page_offset - phys_offset
220                 phdr->p_filesz  = phdr->p_memsz = mend - mstart + 1;
221                 /* Do we need any alignment of segments? */
222                 phdr->p_align   = 0;
223
224                 /* HIGMEM has a virtual address of -1 */
225
226                 if (elf_info->lowmem_limit
227                     && (mend > (elf_info->lowmem_limit - 1)))
228                         phdr->p_vaddr = -1;
229
230                 /* Increment number of program headers. */
231                 (elf->e_phnum)++;
232                 dbgprintf_phdr("Elf header", phdr);
233         }
234         return 0;
235 }
Now, in load_crashdump_segments(), we have built the ELF core header, which will be placed at the end of the memory area reserved for the crash kernel.
From this, the ELF file is composed of:
1. the ELF header;
2. one crash-notes program header per CPU (note: on processors with Hyper-Threading, this means per logical CPU, not per physical core; see http://en.wikipedia.org/wiki/Hyper-threading — ARM processors do not use Hyper-Threading). Each CPU's crash-notes address can be read from "/sys/devices/system/cpu/cpuN/crash_notes";
3. the vmcoreinfo program header; the vmcoreinfo address can be read from /sys/kernel/vmcoreinfo;
4. one program header per memory range, taken from usablemem_rgns.
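Putting it together, a rough sketch of the header buffer (assuming one CPU and the two RAM pieces left by the split; this is a reconstruction, not output from the tool):

+----------------------------------+  offset 0
| Elf32_Ehdr (ET_CORE)             |
+----------------------------------+  e_phoff = sizeof(Elf32_Ehdr)
| Elf32_Phdr  PT_NOTE  cpu0        |  -> cpu0/crash_notes address
| Elf32_Phdr  PT_NOTE  vmcoreinfo  |  -> /sys/kernel/vmcoreinfo address
| Elf32_Phdr  PT_LOAD  RAM piece 1 |  -> System RAM below the reservation
| Elf32_Phdr  PT_LOAD  RAM piece 2 |  -> System RAM above the reservation
+----------------------------------+  sz, then _ALIGN(sz, align)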
main->my_load->zImage_arm_load-> load_crashdump_segments (cont'd)
292         if (err)
293                 return err;
294
295         /*
296          * We allocate ELF core header from the end of the memory area reserved
297          * for the crashkernel. We align the header to SECTION_SIZE (which is
298          * 1MB) so that available memory passed in kernel command line will be
299          * aligned to 1MB. This is because kernel create_mapping() wants memory
300          * regions to be aligned to SECTION_SIZE.
301          */
302         elfcorehdr = add_buffer_phys_virt(info, buf, bufsz, bufsz, 1 << 20,
303                                           crash_reserved_mem.start,
304                                           crash_reserved_mem.end, -1, 0);
305
306         dbgprintf("elfcorehdr: %#lx\n", elfcorehdr);
307         cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
308
309         /*
310          * Add 'mem=size' parameter to dump capture kernel command line. This
311          * prevents the dump capture kernel from using any other memory regions
312          * which belong to the primary kernel.
313          */
314         cmdline_add_mem(mod_cmdline, elfcorehdr - crash_reserved_mem.start);
315
316         dump_memory_ranges();
317         dbgprintf("kernel command line: \"%s\"\n", mod_cmdline);
318
319         return 0;
320 }
Line 302 obtains elfcorehdr, the address of the pre-allocated buffer holding the ELF header and program headers. Line 307 then adds "elfcorehdr=X" to the command line used by the capture kernel. Line 314 adds "mem=Z" to the command line, where Z equals elfcorehdr minus the start of the reserved memory, i.e. roughly (crash reserved memory size - 1MB).
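As a worked example (taking the crashkernel=64M@98M reservation used later in this post; numbers are illustrative):

crash_reserved_mem = [98MB, 162MB) = [0x6200000, 0xA200000)
elfcorehdr         = last 1MB-aligned hole at the top = 0xA100000 (161MB)
mem (Z)            = elfcorehdr - crash_reserved_mem.start
                   = 0xA100000 - 0x6200000 = 63MB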
main->my_load->zImage_arm_load-> load_crashdump_segments-> add_buffer_phys_virt
First, let's walk through the parameters. info was initialized in my_load(). buf and bufsz describe the buffer holding the ELF header and program headers; memsz here equals bufsz. buf_align is 1MB. buf_min and buf_max are the bounds of the reserved memory range. buf_end is -1 and phys is 0.
357 unsigned long add_buffer_phys_virt(struct kexec_info *info,
358         const void *buf, unsigned long bufsz, unsigned long memsz,
359         unsigned long buf_align, unsigned long buf_min, unsigned long buf_max,
360         int buf_end, int phys)
361 {
362         unsigned long base;
363         int result;
364         int pagesize;
365
366         result = sort_segments(info);
367         if (result < 0) {
368                 die("sort_segments failed\n");
369         }
370
371         /* Round memsz up to a multiple of pagesize */
372         pagesize = getpagesize();
373         memsz = _ALIGN(memsz, pagesize);
374
375         base = locate_hole(info, memsz, buf_align, buf_min, buf_max, buf_end);
376         if (base == ULONG_MAX) {
377                 die("locate_hole failed\n");
378         }
379
380         add_segment_phys_virt(info, buf, bufsz, base, memsz, phys);
381         return base;
382 }
Here base ends up in the last 1MB of the memory reserved by crashkernel=x@y. The segment is then added to info via add_segment_phys_virt() at line 380.
main->my_load->zImage_arm_load (cont'd)
As noted for line 335, base is the start of the reserved memory range.
340
341         if (base == ULONG_MAX)
342                 return -1;
343
344         /* assume the maximum kernel compression ratio is 4,
345          * and just to be safe, place ramdisk after that
346          */
347         initrd_base = base + len * 4;
348
349         if (use_atags) {
350                 /*
351                  * use ATAGs from /proc/atags
352                  */
353                 if (atag_arm_load(info, base + atag_offset,
354                                   command_line, command_line_len,
355                                   ramdisk_buf, initrd_size, initrd_base) == -1)
356                         return -1;
…
427         add_segment(info, buf, len, base + offset, len);
428
429         info->entry = (void*)base + offset;
430
431         return 0;
432 }
Now the "--atags" option is handled: atag_arm_load() reads /proc/atags at line 353 and adds the ATAGs to the segments in info. At the end, the kernel buffer is also added to the segments.
main->my_load:
…
777
778         result = kexec_load(
779                 info.entry, info.nr_segments, info.segment, info.kexec_flags);
…
788         return result;
789 }
After the ELF core header is built, the ELF information, ATAGs and kernel buffer have all been added to the segments, and the segments are passed to the kernel through the sys_kexec_load system call.
We have now analyzed the kexec loading routine in user space. Next, the kernel-space code is analyzed.
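For reference, the boundary crossed here is the kexec_load system call. A minimal sketch of invoking it directly (assuming the uapi definitions from <linux/kexec.h>; kexec-tools has its own wrapper):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kexec.h>        /* struct kexec_segment, KEXEC_* flags */

/* Hand the prepared segments to the kernel. For kdump the flags would be
 * KEXEC_ON_CRASH | KEXEC_ARCH_ARM, as discussed above. */
static long do_kexec_load(void *entry, unsigned long nr_segments,
                          struct kexec_segment *segments, unsigned long flags)
{
        return syscall(SYS_kexec_load, entry, nr_segments, segments, flags);
}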
3. Routine of Kernel Mode
In the kexec-tools source code, info->entry is filled with the start of the reserved crash memory plus a 32KB offset.
218 int zImage_arm_load(int argc, char **argv, const char *buf, off_t len,
219         struct kexec_info *info)
…
328                 if (parse_iomem_single("Crash kernel\n", &start, &end)) {
329                         /*
330                          * No crash kernel memory reserved. We cannot do more
331                          * but just bail out.
332                          */
333                         return -1;
334                 }
335                 base = start;
…
427         add_segment(info, buf, len, base + offset, len);
428
429         print_segment(stderr, info);
430         info->entry = (void*)base + offset;
}
For our case, crashkernel=64M@98M, so the reserved region starts at 98MB and info->entry = 98MB + 32KB. When kdump is loaded with "kexec -p", the flags are KEXEC_ARCH_ARM|KEXEC_ON_CRASH.
The segments parameter is prepared by kexec-tools and records the ATAGs, ramdisk, kernel and elfcorehdr. The ATAGs, ramdisk and kernel buffers are added to info->segment[] with add_segment(); the elfcorehdr is added with add_segment_phys_virt(). The sketch below illustrates the segments.
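(A rough reconstruction of the figure from the original post, using the offsets from zImage_arm_load() above; exact addresses depend on the image sizes.)

crashkernel reserved memory
+------------------------------------------+ <- base (start of reservation)
| ATAGs       @ base + 0x1000  (4KB)       |
| zImage      @ base + 0x8000  (32KB)      |  <- info->entry
| initrd      @ base + len * 4             |
|   ...                                    |
| elfcorehdr  @ last 1MB of the region     |
+------------------------------------------+ <- end of reservation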
 941 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 942                 struct kexec_segment __user *, segments, unsigned long, flags)
 943 {
 944         struct kimage **dest_image, *image;
 945         int result;
 …
 969         image = NULL;
 970         result = 0;
 971
 972         /* Because we write directly to the reserved memory
 973          * region when loading crash kernels we need a mutex here to
 974          * prevent multiple crash kernels from attempting to load
 975          * simultaneously, and to prevent a crash kernel from loading
 976          * over the top of a in use crash kernel.
 977          *
 978          * KISS: always take the mutex.
 979          */
 980         if (!mutex_trylock(&kexec_mutex))
 981                 return -EBUSY;
…
 984         if (flags & KEXEC_ON_CRASH)
 985                 dest_image = &kexec_crash_image;
 986         if (nr_segments > 0) {
 987                 unsigned long i;
 988
 989                 /* Loading another kernel to reboot into */
 …
 994                 else if (flags & KEXEC_ON_CRASH) {
 995                         /* Free any current crash dump kernel before
 996                          * we corrupt it.
 997                          */
 998                         kimage_free(xchg(&kexec_crash_image, NULL));
 999                         result = kimage_crash_alloc(&image, entry,
1000                                                     nr_segments, segments);
1001                         crash_map_reserved_pages();//Null here.
1002                 }
1003                 if (result)
1004                         goto out;
1005
1006                 if (flags & KEXEC_PRESERVE_CONTEXT)
1007                         image->preserve_context = 1;
1008                 result = machine_kexec_prepare(image);
1009                 if (result)
1010                         goto out;
1011
1012                 for (i = 0; i < nr_segments; i++) {
1013                         result = kimage_load_segment(image, &image->segment[i]);
1014                         if (result)
1015                                 goto out;
1016                 }
1017                 kimage_terminate(image);
1018                 if (flags & KEXEC_ON_CRASH)
1019                         crash_unmap_reserved_pages();
1020         }
1021         /* Install the new kernel, and  Uninstall the old */
1022         image = xchg(dest_image, image);
1023
1024 out:
1025         mutex_unlock(&kexec_mutex);
1026         kimage_free(image);
1027
1028         return result;
1029 }
kexec_load-> kimage_crash_alloc
In this function there are three steps:
1. Allocate and initialize a struct kimage object (line 999).
2. Allocate a control page, which will be filled when the soft reboot happens (line 999).
3. Copy the buffers recorded in info->segment[] to the reserved memory range (lines 1012~1016).
 272 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 273                                 unsigned long nr_segments,
 274                                 struct kexec_segment __user *segments)
 275 {
 276         int result;
 277         struct kimage *image;
 278         unsigned long i;
 279
 280         image = NULL;
 281         /* Verify we have a valid entry point */
 282         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 283                 result = -EADDRNOTAVAIL;
 284                 goto out;
 285         }
 286
 287         /* Allocate and initialize a controlling structure */
 288         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 289         if (result)
 290                 goto out;
 291
 292         /* Enable the special crash kernel control page
 293          * allocation policy.
 294          */
 295         image->control_page = crashk_res.start;
 296         image->type = KEXEC_TYPE_CRASH;
 297
 298         /*
 299          * Verify we have good destination addresses.  Normally
 300          * the caller is responsible for making certain we don't
 301          * attempt to load the new image into invalid or reserved
 302          * areas of RAM.  But crash kernels are preloaded into a
 303          * reserved area of ram.  We must ensure the addresses
 304          * are in the reserved area otherwise preloading the
 305          * kernel could corrupt things.
 306          */
 307         result = -EADDRNOTAVAIL;
 308         for (i = 0; i < nr_segments; i++) {
 309                 unsigned long mstart, mend;
 310
 311                 mstart = image->segment[i].mem;
 312                 mend = mstart + image->segment[i].memsz - 1;
 313                 /* Ensure we are within the crash kernel limits */
 314                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 315                         goto out;
 316         }
 317
 318         /*
 319          * Find a location for the control code buffer, and add
 320          * the vector of segments so that it's pages will also be
 321          * counted as destination pages.
 322          */
 323         result = -ENOMEM;
 324         image->control_code_page = kimage_alloc_control_pages(image,
 325                                            get_order(KEXEC_CONTROL_PAGE_SIZE));
 326         if (!image->control_code_page) {
 327                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 328                 goto out;
 329         }
 330
 331         result = 0;
 332 out:
 333         if (result == 0)
 334                 *rimage = image;
 335         else
 336                 kfree(image);
 337
 338         return result;
 339 }
Line 288 allocates a struct kimage object and initializes it. Lines 308~316 check each segment's memory range, ensuring it does not fall outside the crashkernel reserved memory.
kexec_load-> kimage_crash_alloc-> do_kimage_alloc
 118 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 119                             unsigned long nr_segments,
 120                             struct kexec_segment __user *segments)
 121 {
 122         size_t segment_bytes;
 123         struct kimage *image;
 124         unsigned long i;
 125         int result;
 126
 127         /* Allocate a controlling structure */
 128         result = -ENOMEM;
 129         image = kzalloc(sizeof(*image), GFP_KERNEL);
 130         if (!image)
 131                 goto out;
 132
 133         image->head = 0;
 134         image->entry = &image->head;
 135         image->last_entry = &image->head;
 136         image->control_page = ~0; /* By default this does not apply */
 137         image->start = entry;
 138         image->type = KEXEC_TYPE_DEFAULT;
 139
 140         /* Initialize the list of control pages */
 141         INIT_LIST_HEAD(&image->control_pages);
 142
 143         /* Initialize the list of destination pages */
 144         INIT_LIST_HEAD(&image->dest_pages);
 145
 146         /* Initialize the list of unusable pages */
 147         INIT_LIST_HEAD(&image->unuseable_pages);
 148
 149         /* Read in the segments */
 150         image->nr_segments = nr_segments;
 151         segment_bytes = nr_segments * sizeof(*segments);
 152         result = copy_from_user(image->segment, segments, segment_bytes);
 153         if (result) {
 154                 result = -EFAULT;
 155                 goto out;
 156         }
 157
 158         /*
 159          * Verify we have good destination addresses.  The caller is
 160          * responsible for making certain we don't attempt to load
 161          * the new image into invalid or reserved areas of RAM.  This
 162          * just verifies it is an address we can use.
 163          *
 164          * Since the kernel does everything in page size chunks ensure
 165          * the destination addresses are page aligned.  Too many
 166          * special cases crop of when we don't do this.  The most
 167          * insidious is getting overlapping destination addresses
 168          * simply because addresses are changed to page size
 169          * granularity.
 170          */
 171         result = -EADDRNOTAVAIL;
 172         for (i = 0; i < nr_segments; i++) {
 173                 unsigned long mstart, mend;
 174
 175                 mstart = image->segment[i].mem;
 176                 mend   = mstart + image->segment[i].memsz;
 177                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 178                         goto out;
 179                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 180                         goto out;
 181         }
 182
 183         /* Verify our destination addresses do not overlap.
 184          * If we alloed overlapping destination addresses
 185          * through very weird things can happen with no
 186          * easy explanation as one segment stops on another.
 187          */
 188         result = -EINVAL;
 189         for (i = 0; i < nr_segments; i++) {
 190                 unsigned long mstart, mend;
 191                 unsigned long j;
 192
 193                 mstart = image->segment[i].mem;
 194                 mend   = mstart + image->segment[i].memsz;
 195                 for (j = 0; j < i; j++) {
 196                         unsigned long pstart, pend;
 197                         pstart = image->segment[j].mem;
 198                         pend   = pstart + image->segment[j].memsz;
 199                         /* Do the segments overlap ? */
 200                         if ((mend > pstart) && (mstart < pend))
 201                                 goto out;
 202                 }
 203         }
 204
 205         /* Ensure our buffer sizes are strictly less than
 206          * our memory sizes.  This should always be the case,
 207          * and it is easier to check up front than to be surprised
 208          * later on.
 209          */
 210         result = -EINVAL;
 211         for (i = 0; i < nr_segments; i++) {
 212                 if (image->segment[i].bufsz > image->segment[i].memsz)
 213                         goto out;
 214         }
 215
 216         result = 0;
 217 out:
 218         if (result == 0)
 219                 *rimage = image;
 220         else
 221                 kfree(image);
 222
 223         return result;
 224
 225 }
kexec_load-> kimage_crash_alloc->kimage_alloc_control_pages-> kimage_alloc_crash_control_pages
This function completes step 2 listed above.
 467 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 468                                                       unsigned int order)
 469 {
 470         /* Control pages are special, they are the intermediaries
 471          * that are needed while we copy the rest of the pages
 472          * to their final resting place.  As such they must
 473          * not conflict with either the destination addresses
 474          * or memory the kernel is already using.
 475          *
 476          * Control pages are also the only pags we must allocate
 477          * when loading a crash kernel.  All of the other pages
 478          * are specified by the segments and we just memcpy
 479          * into them directly.
 480          *
 481          * The only case where we really need more than one of
 482          * these are for architectures where we cannot disable
 483          * the MMU and must instead generate an identity mapped
 484          * page table for all of the memory.
 485          *
 486          * Given the low demand this implements a very simple
 487          * allocator that finds the first hole of the appropriate
 488          * size in the reserved memory region, and allocates all
 489          * of the memory up to and including the hole.
 490          */
 491         unsigned long hole_start, hole_end, size;
 492         struct page *pages;
 493
 494         pages = NULL;
 495         size = (1 << order) << PAGE_SHIFT;
 496         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 497         hole_end   = hole_start + size - 1;
 498         while (hole_end <= crashk_res.end) {
 499                 unsigned long i;
 500
 501                 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
 502                         break;
 503                 if (hole_end > crashk_res.end)
 504                         break;
 505                 /* See if I overlap any of the segments */
 506                 for (i = 0; i < image->nr_segments; i++) {
 507                         unsigned long mstart, mend;
 508
 509                         mstart = image->segment[i].mem;
 510                         mend   = mstart + image->segment[i].memsz - 1;
 511                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 512                                 /* Advance the hole to the end of the segment */
 513                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 514                                 hole_end   = hole_start + size - 1;
 515                                 break;
 516                         }
 517                 }
 518                 /* If I don't overlap any segments I have found my hole! */
 519                 if (i == image->nr_segments) {
 520                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 521                         break;
 522                 }
 523         }
 524         if (pages)
 525                 image->control_page = hole_end;
 526
 527         return pages;
 528 }
Next, the buffers are copied to the memory allocated within the reserved crash memory range, completing step 3 listed above.
kexec_load-> kimage_load_segment-> kimage_load_crash_segment:
 845 static int kimage_load_crash_segment(struct kimage *image,
 846                                         struct kexec_segment *segment)
 847 {
 848         /* For crash dumps kernels we simply copy the data from
 849          * user space to it's destination.
 850          * We do things a page at a time for the sake of kmap.
 851          */
 852         unsigned long maddr;
 853         unsigned long ubytes, mbytes;
 854         int result;
 855         unsigned char __user *buf;
 856
 857         result = 0;
 858         buf = segment->buf;
 859         ubytes = segment->bufsz;
 860         mbytes = segment->memsz;
 861         maddr = segment->mem;
 862         while (mbytes) {
 863                 struct page *page;
 864                 char *ptr;
 865                 size_t uchunk, mchunk;
 866
 867                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 868                 if (!page) {
 869                         result  = -ENOMEM;
 870                         goto out;
 871                 }
 872                 ptr = kmap(page);
 873                 ptr += maddr & ~PAGE_MASK;
 874                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 875                 if (mchunk > mbytes)
 876                         mchunk = mbytes;
 877
 878                 uchunk = mchunk;
 879                 if (uchunk > ubytes) {
 880                         uchunk = ubytes;
 881                         /* Zero the trailing part of the page */
 882                         memset(ptr + uchunk, 0, mchunk - uchunk);
 883                 }
 884                 result = copy_from_user(ptr, buf, uchunk);
 885                 kexec_flush_icache_page(page);
 886                 kunmap(page);
 887                 if (result) {
 888                         result = -EFAULT;
 889                         goto out;
 890                 }
 891                 ubytes -= uchunk;
 892                 maddr  += mchunk;
 893                 buf    += mchunk;
 894                 mbytes -= mchunk;
 895         }
 896 out:
 897         return result;
 898 }

 