都说Kdump是基于kexec机制工作的,但关于Kdump到底是怎么实现的,比如第二个内核是怎么加载到具体的保留位置的,第一个内核crash后怎么把需要的elfcorehdr和memmap参数传给第二个内核,另外第二个内核是怎么调用makedumpfile来过滤压缩页的,网上一些资料给的都太概括了,还没找到相关的分析。看了下代码,有了个大概,可能部分理解有误,欢迎拍砖和探讨。
先看一张图,这个是网上找到的Vivek Goyal的PPT中两幅图,这里合成一张了
KEXEC的设计是用新内核去覆盖原内核位置;而KDUMP是预留一块内存来加载第二个内核(和相关数据),Crash后第二个内核就在这块预留内存中原地运行(不然就达不到相关目的了),收集第一个内核的相关内存信息。在KDUMP中kexec算是一个引导器,类似GRUB(2)。真正的实现是在kexec-tools中;对于RH系列,相关的kexec-tools RPM包中除了封装相关程序外,还有个/etc/rc.d/init.d/kdump shell脚本来负责将相关工具粘在一起。
下面来说下大致流程: 1).第一个内核以crashkernel参数启动后,内核解析此crashkernel命令行选项并将此选项值放到crashk_res中,并预留相关内存区域
/* crashkernel=size@addr specifies the location to reserve for * a crash kernel. By reserving this memory we guarantee * that linux never sets it up as a DMA target. * Useful for holding code to do something appropriate * after a kernel panic. */
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = { //参见crash_res定义!
.name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM };//这里存放的应该是命令行和/proc/iomem看到的0x1000000-0x7ffffff : Crash kernel static int __init parse_crashkernel(char *arg) { unsigned long size, base; size = memparse(arg, &arg); if (*arg == '@') { base = memparse(arg+1, &arg); /* FIXME: Do I want a sanity check * to validate the memory range? */ crashk_res.start = base;//存到此处 crashk_res.end = base + size - 1; } return 0; } early_param("crashkernel", parse_crashkernel);//将与crashkernel关联的parse_crashkernel放到.init.setup中
| /etc/init.d/kdump start启动时(只摘录部分相关的)
# Copy /proc/vmcore into a timestamped directory.
# The base directory is the "path" entry of $KDUMP_CONFIG_FILE, or
# /var/crash when no such entry exists.  The dump is first written as
# vmcore-incomplete and renamed to vmcore only when cp succeeded.
# Sets the global $coredir; returns the exit status of cp.
function save_core()
{
	local kdump_path
	kdump_path=`grep ^path $KDUMP_CONFIG_FILE | cut -d' ' -f2-`
	if [ -z "$kdump_path" ]; then
		coredir="/var/crash/`date +"%Y-%m-%d-%H:%M"`"
	else
		coredir="${kdump_path}/`date +"%Y-%m-%d-%H:%M"`"
	fi

	# "cut -f2-" keeps everything after the first space, so the configured
	# path may itself contain spaces: quote every use of $coredir.
	mkdir -p "$coredir"
	cp --sparse=always /proc/vmcore "$coredir/vmcore-incomplete"
	exitcode=$?
	if [ $exitcode == 0 ]; then
		mv "$coredir/vmcore-incomplete" "$coredir/vmcore"
		$LOGGER "saved a vmcore to $coredir"
	else
		$LOGGER "failed to save a vmcore to $coredir"
	fi
	return $exitcode
}

# Load the capture (kdump) kernel with kexec.
# Builds the capture kernel's command line, picks the ELF core header
# width on 32-bit x86, then runs $KEXEC.
# Returns 0 when kexec loaded the kernel, 1 when no crash memory is
# reserved or kexec failed.
function load_kdump()
{
	# Fall back to the running kernel's command line when none is configured.
	if [ -z "$KDUMP_COMMANDLINE" ]
	then
		KDUMP_COMMANDLINE=`cat /proc/cmdline`
	fi

	# Check that memory was reserved: on ppc64 look for crashkernel=X@Y on
	# the command line, elsewhere for a non-empty "Crash kernel" range in
	# /proc/iomem.
	ARCH=`uname -m`
	if [ "$ARCH" == "ppc64" ]
	then
		MEM_RESERVED=`grep "crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]" /proc/cmdline`
	else
		MEM_RESERVED=`grep "Crash kernel" /proc/iomem | grep -v "00000000-00000000"`
	fi
	if [ -z "$MEM_RESERVED" ]
	then
		$LOGGER "No crashkernel parameter specified for running kernel"
		return 1
	fi

	# 32-bit x86: add --elf64-core-headers or --elf32-core-headers according
	# to need_64bit_headers, unless the user already forced one in KEXEC_ARGS.
	if [ "$ARCH" == "i686" -o "$ARCH" == "i386" ]
	then
		need_64bit_headers
		if [ $? == 1 ]
		then
			FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf32-core-headers`
			if [ -n "$FOUND_ELF_ARGS" ]
			then
				echo -n "Warning: elf32-core-headers overrides correct elf64 setting"
				warning
				echo
			else
				KEXEC_ARGS="$KEXEC_ARGS --elf64-core-headers"
			fi
		else
			FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf64-core-headers`
			if [ -z "$FOUND_ELF_ARGS" ]
			then
				KEXEC_ARGS="$KEXEC_ARGS --elf32-core-headers"
			fi
		fi
	fi

	# Strip crashkernel=, mem=, hugepages= and hugepagesz= from the command
	# line handed to the capture kernel.
	KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e 's/crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]//'`
	KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e's/mem=[0-9]\+[GMKgmk]* *//'`
	KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e's/hugepages=[0-9]\+ */ /g' -e's/hugepagesz=[0-9]\+[kKmMgG]* */ /g'`

	KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_COMMANDLINE_APPEND}"
	avoid_cdrom_drive
	KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_IDE_NOPROBE_COMMANDLINE}"

	# This is the most important part: the actual kexec invocation.
	# $standard_kexec_args is defined outside this excerpt.
	KEXEC_OUTPUT=`$KEXEC $KEXEC_ARGS $standard_kexec_args \
		--command-line="$KDUMP_COMMANDLINE" \
		--initrd=$kdump_initrd $kdump_kernel 2>&1`
	if [ $? == 0 ]; then
		$LOGGER "kexec: loaded kdump kernel"
		return 0
	else
		$LOGGER $KEXEC_OUTPUT
		$LOGGER "kexec: failed to load kdump kernel"
		return 1
	fi
}
function start() {
	#TODO check raw partition for core dump image

	# Bail out early when kdump is unsupported (status 2) or already
	# loaded (status 0).
	status
	rc=$?
	if [ $rc == 2 ]; then
		echo -n "Kdump is not supported on this kernel"; failure; echo
		return 1;
	elif [ $rc == 0 ]; then
		echo -n "Kdump already running"; success; echo
		return 0
	fi

	# The configuration must be valid before anything is loaded.
	if ! check_config; then
		echo -n "Starting kdump:"; failure; echo
		$LOGGER "failed to start up, config file incorrect"
		return 1
	fi

	# Load the capture kernel.
	if ! load_kdump; then
		echo -n "Starting kdump:"; failure; echo
		$LOGGER "failed to start up"
		return 1
	fi

	echo -n "Starting kdump:"; success; echo
	$LOGGER "started up"
}
case "$1" in start) if [ -s /proc/vmcore ]; then #第二个内核启动后走此步!
run_kdump_pre save_core run_kdump_post $? do_final_action else #刚开始走此步!
start fi ;;
| 最后是调用如下形式
kexec --args-linux --elf32(64)-core-headers -p --command-line="$KDUMP_COMMANDLINE" --initrd=$kdump_initrd $kdump_kernel
其中commandline是在配置文件中手动设置的或者从/proc/cmdline得到
| 这个就到了上次分析kexec的代码了,注意此处是以-p来调用的
int elf_x86_load(int argc, char **argv, const char *buf, off_t len, struct kexec_info *info)//******************
{ struct mem_ehdr ehdr; const char *command_line; char *modified_cmdline; int command_line_len; int modified_cmdline_len; const char *ramdisk; unsigned long entry, max_addr; int arg_style; #define ARG_STYLE_ELF 0 #define ARG_STYLE_LINUX 1 #define ARG_STYLE_NONE 2 int opt; #define OPT_APPEND (OPT_ARCH_MAX+0) #define OPT_REUSE_CMDLINE (OPT_ARCH_MAX+1) #define OPT_RAMDISK (OPT_ARCH_MAX+2) #define OPT_ARGS_ELF (OPT_ARCH_MAX+3) #define OPT_ARGS_LINUX (OPT_ARCH_MAX+4) #define OPT_ARGS_NONE (OPT_ARCH_MAX+5)
static const struct option options[] = {//参见http:///blog/tag/getopt_long/ noted by peter.guo
KEXEC_ARCH_OPTIONS { "command-line", 1, NULL, OPT_APPEND }, { "append", 1, NULL, OPT_APPEND }, { "reuse-cmdline", 1, NULL, OPT_REUSE_CMDLINE }, { "initrd", 1, NULL, OPT_RAMDISK }, { "ramdisk", 1, NULL, OPT_RAMDISK }, { "args-elf", 0, NULL, OPT_ARGS_ELF }, { "args-linux", 0, NULL, OPT_ARGS_LINUX }, { "args-none", 0, NULL, OPT_ARGS_NONE }, { 0, 0, NULL, 0 }, };
static const char short_options[] = KEXEC_OPT_STR "";
/* * Parse the command line arguments */ arg_style = ARG_STYLE_ELF; command_line = 0; modified_cmdline = 0; modified_cmdline_len = 0; ramdisk = 0; while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
//属于gnu体系
switch(opt) { default: /* Ignore core options */ if (opt < OPT_ARCH_MAX) { break; } case '?': usage(); return -1; case OPT_APPEND://进入此! command_line = optarg; break; case OPT_REUSE_CMDLINE: command_line = get_command_line(); break; case OPT_RAMDISK: //进入此!
ramdisk = optarg; break; case OPT_ARGS_ELF: arg_style = ARG_STYLE_ELF; break; case OPT_ARGS_LINUX://进入此!
arg_style = ARG_STYLE_LINUX; break; case OPT_ARGS_NONE: #ifdef __i386__ arg_style = ARG_STYLE_NONE; #else die("--args-none only works on arch i386\n"); #endif break; } } command_line_len = 0; if (command_line) { command_line_len = strlen(command_line) +1; }
/* Need to append some command line parameters internally in case of * taking crash dumps. */ if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) { modified_cmdline = xmalloc(COMMAND_LINE_SIZE);//分配一个新的空间来盛命令行!
memset((void *)modified_cmdline, 0, COMMAND_LINE_SIZE); if (command_line) { strncpy(modified_cmdline, command_line, COMMAND_LINE_SIZE); modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0'; } modified_cmdline_len = strlen(modified_cmdline); }
/* Load the ELF executable */ elf_exec_build_load(info, &ehdr, buf, len, 0);//========================>
entry = ehdr.e_entry; max_addr = elf_max_addr(&ehdr);
/* Do we want arguments? */ if (arg_style != ARG_STYLE_NONE) {//=====>
/* Load the setup code *///===========>pay more attention to purgatory!!!!!!
elf_rel_build_load(info, &info->rhdr, (char *) purgatory, purgatory_size, 0, ULONG_MAX, 1, 0); } if (arg_style == ARG_STYLE_NONE) { info->entry = (void *)entry;
} else if (arg_style == ARG_STYLE_ELF) { unsigned long note_base; struct entry32_regs regs; uint32_t arg1, arg2;
/* Setup the ELF boot notes */ note_base = elf_boot_notes(info, max_addr, (unsigned char *) command_line, command_line_len);
/* Initialize the stack arguments */ arg2 = 0; /* No return address */ arg1 = note_base; elf_rel_set_symbol(&info->rhdr, "stack_arg32_1", &arg1, sizeof(arg1)); elf_rel_set_symbol(&info->rhdr, "stack_arg32_2", &arg2, sizeof(arg2)); /* Initialize the registers */ elf_rel_get_symbol(&info->rhdr, "entry32_regs", ®s, sizeof(regs)); regs.eip = entry; /* The entry point */ regs.esp = elf_rel_get_addr(&info->rhdr, "stack_arg32_2"); elf_rel_set_symbol(&info->rhdr, "entry32_regs", ®s, sizeof(regs));
if (ramdisk) { die("Ramdisks not supported with generic elf arguments"); } } else if (arg_style == ARG_STYLE_LINUX) {//=====>got it !!!!!!
struct x86_linux_faked_param_header *hdr; unsigned long param_base; const unsigned char *ramdisk_buf; off_t ramdisk_length; struct entry32_regs regs; int rc = 0;
/* Get the linux parameter header */ hdr = xmalloc(sizeof(*hdr));
/* Hack: With some ld versions, vmlinux program headers show * a gap of two pages between bss segment and data segment * but effectively kernel considers it as bss segment and * overwrites the any data placed there. Hence bloat the * memsz of parameter segment to 16K to avoid being placed * in such gaps. * This is a makeshift solution until it is fixed in kernel */ param_base = add_buffer(info, hdr, sizeof(*hdr), 16*1024, 16, 0, max_addr, 1);
/* Initialize the parameter header */ memset(hdr, 0, sizeof(*hdr)); init_linux_parameters(&hdr->hdr);
/* Add a ramdisk to the current image */ ramdisk_buf = NULL; ramdisk_length = 0; if (ramdisk) { ramdisk_buf = (unsigned char *) slurp_file(ramdisk, &ramdisk_length); }
/* If panic kernel is being loaded, additional segments need * to be created. */ if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {
/*
Command line: ro root=LABEL=/ rhgb quiet irqpoll maxcpus=1 reset_devices memmap=exactmap memmap=640K@0K memmap=5264K@16384K memmap=125152K@22288K elfcorehdr=147440K (0x8ffc000) memmap=56K#1834688K memmap=136K#1834744K memmap=128K#1834880K memmap=1024K$4193280K //红色部分'#'代表specific memory forACPI data. '$'代表specific memory as reserved. 没在代码中查找到?
/此处得到相关的memmap和elfcorehdr参数并存入新的命令行参数中
*/
rc = load_crashdump_segments(info, modified_cmdline, max_addr, 0); if (rc < 0) return -1; /* Use new command line. */ command_line = modified_cmdline; command_line_len = strlen(modified_cmdline) + 1; }
/* Tell the kernel what is going on */ setup_linux_bootloader_parameters(info, &hdr->hdr, param_base, offsetof(struct x86_linux_faked_param_header, command_line), command_line, command_line_len, ramdisk_buf, ramdisk_length);//======>got it !!!!!!
/* Fill in the information bios calls would usually provide */ setup_linux_system_parameters(&hdr->hdr, info->kexec_flags);
/* Initialize the registers */ elf_rel_get_symbol(&info->rhdr, "entry32_regs", ®s, sizeof(regs)); regs.ebx = 0; /* Bootstrap processor */ regs.esi = param_base; /* Pointer to the parameters */ regs.eip = entry; /* The entry point */ regs.esp = elf_rel_get_addr(&info->rhdr, "stack_end"); /* Stack, unused */ elf_rel_set_symbol(&info->rhdr, "entry32_regs", ®s, sizeof(regs)); } else { die("Unknown argument style\n"); } return 0; }
|
/* Loads additional segments in case of a panic kernel is being loaded. * One segment for backup region, another segment for storing elf headers * for crash memory image. */ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline, unsigned long max_addr, unsigned long min_base)
{ void *tmp; unsigned long sz, elfcorehdr; int nr_ranges, align = 1024; struct memory_range *mem_range, *memmap_p;
if (get_crash_memory_ranges(&mem_range, &nr_ranges, info->kexec_flags) < 0) return -1;
/* * if the core type has not been set on command line, set it here * automatically */ if (arch_options.core_header_type == CORE_TYPE_UNDEF) { arch_options.core_header_type = get_core_type(info, mem_range, nr_ranges); }
/* 1.Memory regions which panic kernel can safely use to boot into */ sz = (sizeof(struct memory_range) * (KEXEC_MAX_SEGMENTS + 1)); memmap_p = xmalloc(sz); memset(memmap_p, 0, sz); add_memmap(memmap_p, BACKUP_SRC_START, BACKUP_SRC_SIZE);//第一块!
sz = crash_reserved_mem.end - crash_reserved_mem.start +1; add_memmap(memmap_p, crash_reserved_mem.start, sz);//第二块!
/* 2.Create a backup region segment to store backup data*/ if (!(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) { sz = (BACKUP_SRC_SIZE + align - 1) & ~(align - 1); tmp = xmalloc(sz); memset(tmp, 0, sz); info->backup_start = add_buffer(info, tmp, sz, sz, align, 0, max_addr, -1); dbgprintf("Created backup segment at 0x%lx\n", info->backup_start); if (delete_memmap(memmap_p, info->backup_start, sz) < 0) return -1; }
/* 3.Create elf header segment and store crash image (1st or 2nd????????)data. */ if (arch_options.core_header_type == CORE_TYPE_ELF64) { if (crash_create_elf64_headers(info, &elf_info64, crash_memory_range, nr_ranges, &tmp, &sz, ELF_CORE_HEADER_ALIGN) < 0) return -1; } else { if (crash_create_elf32_headers(info, &elf_info32, crash_memory_range, nr_ranges, &tmp, &sz, ELF_CORE_HEADER_ALIGN) < 0)//哪里定义的??????noted by peter.guo
return -1; }
/* Hack: With some ld versions (GNU ld version 2.14.90.0.4 20030523), * vmlinux program headers show a gap of two pages between bss segment * and data segment but effectively kernel considers it as bss segment * and overwrites the any data placed there. Hence bloat (使膨胀)the memsz of * elf core header segment to 16K to avoid being placed in such gaps. * This is a makeshift solution until it is fixed in kernel. */ elfcorehdr = add_buffer(info, tmp, sz, 16*1024, align, min_base, max_addr, -1); dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr); if (delete_memmap(memmap_p, elfcorehdr, sz) < 0) return -1; cmdline_add_memmap(mod_cmdline, memmap_p);
cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
//为啥此处没有K# 和K$形式的 ???????
return 0; }
|
|