/* * kexec: Linux boots Linux * * Created by: R Sharada (sharada@in.ibm.com) * Copyright (C) IBM Corporation, 2005. All rights reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation (version 2 of the License). * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include #include #include #include #include #include #include #include #include #include #include "../../kexec.h" #include "../../kexec-elf.h" #include "../../kexec-syscall.h" #include "../../crashdump.h" #include "kexec-ppc64.h" #include "../../fs2dt.h" #include "crashdump-ppc64.h" #define DEVTREE_CRASHKERNEL_BASE "/proc/device-tree/chosen/linux,crashkernel-base" #define DEVTREE_CRASHKERNEL_SIZE "/proc/device-tree/chosen/linux,crashkernel-size" static struct crash_elf_info elf_info64 = { class: ELFCLASS64, #if BYTE_ORDER == LITTLE_ENDIAN data: ELFDATA2LSB, #else data: ELFDATA2MSB, #endif machine: EM_PPC64, page_offset: PAGE_OFFSET, lowmem_limit: MAXMEM, }; static struct crash_elf_info elf_info32 = { class: ELFCLASS32, data: ELFDATA2MSB, machine: EM_PPC64, page_offset: PAGE_OFFSET, lowmem_limit: MAXMEM, }; extern struct arch_options_t arch_options; /* Stores a sorted list of RAM memory ranges for which to create elf headers. 
* A separate program header is created for backup region */ static struct memory_range *crash_memory_range = NULL; /* Define a variable to replace the CRASH_MAX_MEMORY_RANGES macro */ static int crash_max_memory_ranges; /* * Used to save various memory ranges/regions needed for the captured * kernel to boot. (lime memmap= option in other archs) */ mem_rgns_t usablemem_rgns = {0, NULL}; static unsigned long long cstart, cend; static int memory_ranges; /* * Exclude the region that lies within crashkernel and above the memory * limit which is reflected by mem= kernel option. */ static void exclude_crash_region(uint64_t start, uint64_t end) { /* If memory_limit is set then exclude the memory region above it. */ if (memory_limit) { if (start >= memory_limit) return; if (end > memory_limit) end = memory_limit; } if (cstart < end && cend > start) { if (start < cstart && end > cend) { crash_memory_range[memory_ranges].start = start; crash_memory_range[memory_ranges].end = cstart; crash_memory_range[memory_ranges].type = RANGE_RAM; memory_ranges++; crash_memory_range[memory_ranges].start = cend; crash_memory_range[memory_ranges].end = end; crash_memory_range[memory_ranges].type = RANGE_RAM; memory_ranges++; } else if (start < cstart) { crash_memory_range[memory_ranges].start = start; crash_memory_range[memory_ranges].end = cstart; crash_memory_range[memory_ranges].type = RANGE_RAM; memory_ranges++; } else if (end > cend) { crash_memory_range[memory_ranges].start = cend; crash_memory_range[memory_ranges].end = end; crash_memory_range[memory_ranges].type = RANGE_RAM; memory_ranges++; } } else { crash_memory_range[memory_ranges].start = start; crash_memory_range[memory_ranges].end = end; crash_memory_range[memory_ranges].type = RANGE_RAM; memory_ranges++; } } static int get_dyn_reconf_crash_memory_ranges(void) { uint64_t start, end; uint64_t startrange, endrange; char fname[128], buf[32]; FILE *file; unsigned int i; int n; uint32_t flags; strcpy(fname, "/proc/device-tree/"); 
strcat(fname, "ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory"); if ((file = fopen(fname, "r")) == NULL) { perror(fname); return -1; } fseek(file, 4, SEEK_SET); startrange = endrange = 0; for (i = 0; i < num_of_lmbs; i++) { if ((n = fread(buf, 1, 24, file)) < 0) { perror(fname); fclose(file); return -1; } if (memory_ranges >= (max_memory_ranges + 1)) { /* No space to insert another element. */ fprintf(stderr, "Error: Number of crash memory ranges" " excedeed the max limit\n"); return -1; } start = be64_to_cpu(((uint64_t *)buf)[DRCONF_ADDR]); end = start + lmb_size; if (start == 0 && end >= (BACKUP_SRC_END + 1)) start = BACKUP_SRC_END + 1; flags = be32_to_cpu((*((uint32_t *)&buf[DRCONF_FLAGS]))); /* skip this block if the reserved bit is set in flags (0x80) or if the block is not assigned to this partition (0x8) */ if ((flags & 0x80) || !(flags & 0x8)) continue; if (start != endrange) { if (startrange != endrange) exclude_crash_region(startrange, endrange); startrange = start; } endrange = end; } if (startrange != endrange) exclude_crash_region(startrange, endrange); fclose(file); return 0; } /* * For a given memory node, check if it is mapped to system RAM or * to onboard memory on accelerator device like GPU card or such. 
*/
/*
 * fname is the path of a device-tree memory node directory.
 *
 * Returns 1 if the node has a "compatible" property that starts with
 * "ibm,coherent-device-memory", 0 if it has no such property or the
 * property holds something else, and -1 if the directory or the property
 * cannot be opened or read.
 */
static int is_coherent_device_mem(const char *fname)
{
	static const char coherent[] = "ibm,coherent-device-memory";
	char fpath[PATH_LEN];
	/* Zero-filled so a short property never exposes stale stack bytes. */
	char buf[32] = {0};
	DIR *dmem;
	FILE *file;
	struct dirent *mentry;
	int len, ret = 0;

	dmem = opendir(fname);
	if (dmem == NULL) {
		perror(fname);
		return -1;
	}

	while ((mentry = readdir(dmem)) != NULL) {
		if (strcmp(mentry->d_name, "compatible"))
			continue;

		/* Bounded replacement for strcpy() + strcat(). */
		len = snprintf(fpath, sizeof(fpath), "%s/compatible", fname);
		if (len < 0 || (size_t)len >= sizeof(fpath)) {
			fprintf(stderr, "%s/compatible: path too long\n",
				fname);
			ret = -1;
			break;
		}

		file = fopen(fpath, "r");
		if (file == NULL) {
			perror(fpath);
			ret = -1;
			break;
		}

		/*
		 * fread() reports failure through ferror(), not through a
		 * negative return value.
		 */
		(void)fread(buf, 1, sizeof(buf), file);
		if (ferror(file)) {
			perror(fpath);
			fclose(file);
			ret = -1;
			break;
		}
		fclose(file);

		if (!strncmp(buf, coherent, sizeof(coherent) - 1))
			ret = 1;
		/* Only one "compatible" entry is ever examined. */
		break;
	}

	closedir(dmem);
	return ret;
}

/* Reads the appropriate file and retrieves the SYSTEM RAM regions for whom to
 * create Elf headers. Keeping it separate from get_memory_ranges() as
 * requirements are different in the case of normal kexec and crashdumps.
 *
 * Normal kexec needs to look at all of available physical memory irrespective
 * of the fact how much of it is being used by currently running kernel.
 * Crashdumps need to have access to memory regions actually being used by
 * running kernel. Expecting a different file/data structure than /proc/iomem
 * to look into down the line. May be something like /proc/kernelmem or may
 * be zone data structures exported from kernel.
*/ static int get_crash_memory_ranges(struct memory_range **range, int *ranges) { char device_tree[256] = "/proc/device-tree/"; char fname[PATH_LEN]; char buf[MAXBYTES]; DIR *dir, *dmem; FILE *file; struct dirent *dentry, *mentry; int n, ret, crash_rng_len = 0; unsigned long long start, end; int page_size; crash_max_memory_ranges = max_memory_ranges + 6; crash_rng_len = sizeof(struct memory_range) * crash_max_memory_ranges; crash_memory_range = (struct memory_range *) malloc(crash_rng_len); if (!crash_memory_range) { fprintf(stderr, "Allocation for crash memory range failed\n"); return -1; } memset(crash_memory_range, 0, crash_rng_len); /* create a separate program header for the backup region */ crash_memory_range[0].start = BACKUP_SRC_START; crash_memory_range[0].end = BACKUP_SRC_END + 1; crash_memory_range[0].type = RANGE_RAM; memory_ranges++; if ((dir = opendir(device_tree)) == NULL) { perror(device_tree); goto err; } cstart = crash_base; cend = crash_base + crash_size; while ((dentry = readdir(dir)) != NULL) { if (!strncmp(dentry->d_name, "ibm,dynamic-reconfiguration-memory", 35)){ get_dyn_reconf_crash_memory_ranges(); continue; } if (strncmp(dentry->d_name, "memory@", 7) && strcmp(dentry->d_name, "memory")) continue; strcpy(fname, device_tree); strcat(fname, dentry->d_name); ret = is_coherent_device_mem(fname); if (ret == -1) { closedir(dir); goto err; } else if (ret == 1) { /* * Avoid adding this memory region as it is not * mapped to system RAM. */ continue; } if ((dmem = opendir(fname)) == NULL) { perror(fname); closedir(dir); goto err; } while ((mentry = readdir(dmem)) != NULL) { if (strcmp(mentry->d_name, "reg")) continue; strcat(fname, "/reg"); if ((file = fopen(fname, "r")) == NULL) { perror(fname); closedir(dmem); closedir(dir); goto err; } if ((n = fread(buf, 1, MAXBYTES, file)) < 0) { perror(fname); fclose(file); closedir(dmem); closedir(dir); goto err; } if (memory_ranges >= (max_memory_ranges + 1)) { /* No space to insert another element. 
*/ fprintf(stderr, "Error: Number of crash memory ranges" " excedeed the max limit\n"); goto err; } start = be64_to_cpu(((unsigned long long *)buf)[0]); end = start + be64_to_cpu(((unsigned long long *)buf)[1]); if (start == 0 && end >= (BACKUP_SRC_END + 1)) start = BACKUP_SRC_END + 1; exclude_crash_region(start, end); fclose(file); } closedir(dmem); } closedir(dir); /* * If RTAS region is overlapped with crashkernel, need to create ELF * Program header for the overlapped memory. */ if (crash_base < rtas_base + rtas_size && rtas_base < crash_base + crash_size) { page_size = getpagesize(); cstart = rtas_base; cend = rtas_base + rtas_size; if (cstart < crash_base) cstart = crash_base; if (cend > crash_base + crash_size) cend = crash_base + crash_size; /* * The rtas section created here is formed by reading rtas-base * and rtas-size from /proc/device-tree/rtas. Unfortunately * rtas-size is not required to be a multiple of PAGE_SIZE * The remainder of the page it ends on is just garbage, and is * safe to read, its just not accounted in rtas-size. Since * we're creating an elf section here though, lets round it up * to the next page size boundary though, so makedumpfile can * read it safely without going south on us. */ cend = _ALIGN(cend, page_size); crash_memory_range[memory_ranges].start = cstart; crash_memory_range[memory_ranges++].end = cend; } /* * If OPAL region is overlapped with crashkernel, need to create ELF * Program header for the overlapped memory. */ if (crash_base < opal_base + opal_size && opal_base < crash_base + crash_size) { page_size = getpagesize(); cstart = opal_base; cend = opal_base + opal_size; if (cstart < crash_base) cstart = crash_base; if (cend > crash_base + crash_size) cend = crash_base + crash_size; /* * The opal section created here is formed by reading opal-base * and opal-size from /proc/device-tree/ibm,opal. 
Unfortunately * opal-size is not required to be a multiple of PAGE_SIZE * The remainder of the page it ends on is just garbage, and is * safe to read, its just not accounted in opal-size. Since * we're creating an elf section here though, lets round it up * to the next page size boundary though, so makedumpfile can * read it safely without going south on us. */ cend = _ALIGN(cend, page_size); crash_memory_range[memory_ranges].start = cstart; crash_memory_range[memory_ranges++].end = cend; } *range = crash_memory_range; *ranges = memory_ranges; int j; dbgprintf("CRASH MEMORY RANGES\n"); for(j = 0; j < *ranges; j++) { start = crash_memory_range[j].start; end = crash_memory_range[j].end; dbgprintf("%016Lx-%016Lx\n", start, end); } return 0; err: if (crash_memory_range) free(crash_memory_range); return -1; } /* Converts unsigned long to ascii string. */ static void ultoa(uint64_t i, char *str) { int j = 0, k; char tmp; do { str[j++] = i % 10 + '0'; } while ((i /=10) > 0); str[j] = '\0'; /* Reverse the string. */ for (j = 0, k = strlen(str) - 1; j < k; j++, k--) { tmp = str[k]; str[k] = str[j]; str[j] = tmp; } } static int add_cmdline_param(char *cmdline, uint64_t addr, char *cmdstr, char *byte) { int cmdline_size, cmdlen, len, align = 1024; char str[COMMAND_LINE_SIZE], *ptr; /* Passing in =xxxK / =xxxM format. Saves space required in cmdline.*/ switch (byte[0]) { case 'K': if (addr%align) return -1; addr = addr/align; break; case 'M': addr = addr/(align *align); break; } ptr = str; strcpy(str, cmdstr); ptr += strlen(str); ultoa(addr, ptr); strcat(str, byte); len = strlen(str); cmdlen = strlen(cmdline) + len; cmdline_size = (kernel_version() < KERNEL_VERSION(3, 15, 0) ? 512 : COMMAND_LINE_SIZE); if (cmdlen > (cmdline_size - 1)) die("Command line overflow\n"); strcat(cmdline, str); dbgprintf("Command line after adding elfcorehdr: %s\n", cmdline); return 0; } /* Loads additional segments in case of a panic kernel is being loaded. 
* One segment for backup region, another segment for storing elf headers * for crash memory image. */ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline, uint64_t max_addr, unsigned long min_base) { void *tmp; unsigned long sz; uint64_t elfcorehdr; int nr_ranges, align = 1024, i; unsigned long long end; struct memory_range *mem_range; if (get_crash_memory_ranges(&mem_range, &nr_ranges) < 0) return -1; info->backup_src_start = BACKUP_SRC_START; info->backup_src_size = BACKUP_SRC_SIZE; /* Create a backup region segment to store backup data*/ sz = _ALIGN(BACKUP_SRC_SIZE, align); tmp = xmalloc(sz); memset(tmp, 0, sz); info->backup_start = add_buffer(info, tmp, sz, sz, align, 0, max_addr, 1); reserve(info->backup_start, sz); /* On ppc64 memory ranges in device-tree is denoted as start * and size rather than start and end, as is the case with * other architectures like i386 . Because of this when loading * the memory ranges in crashdump-elf.c the filesz calculation * [ end - start + 1 ] goes for a toss. * * To be in sync with other archs adjust the end value for * every crash memory range before calling the generic function */ for (i = 0; i < nr_ranges; i++) { end = crash_memory_range[i].end - 1; crash_memory_range[i].end = end; } /* Create elf header segment and store crash image data. */ if (arch_options.core_header_type == CORE_TYPE_ELF64) { if (crash_create_elf64_headers(info, &elf_info64, crash_memory_range, nr_ranges, &tmp, &sz, ELF_CORE_HEADER_ALIGN) < 0) return -1; } else { if (crash_create_elf32_headers(info, &elf_info32, crash_memory_range, nr_ranges, &tmp, &sz, ELF_CORE_HEADER_ALIGN) < 0) return -1; } elfcorehdr = add_buffer(info, tmp, sz, sz, align, min_base, max_addr, 1); reserve(elfcorehdr, sz); /* modify and store the cmdline in a global array. 
This is later * read by flatten_device_tree and modified if required */ add_cmdline_param(mod_cmdline, elfcorehdr, " elfcorehdr=", "K"); return 0; } /* * Used to save various memory regions needed for the captured kernel. */ void add_usable_mem_rgns(unsigned long long base, unsigned long long size) { unsigned int i; unsigned long long end = base + size; unsigned long long ustart, uend; base = _ALIGN_DOWN(base, getpagesize()); end = _ALIGN_UP(end, getpagesize()); for (i=0; i < usablemem_rgns.size; i++) { ustart = usablemem_rgns.ranges[i].start; uend = usablemem_rgns.ranges[i].end; if (base < uend && end > ustart) { if ((base >= ustart) && (end <= uend)) return; if (base < ustart && end > uend) { usablemem_rgns.ranges[i].start = base; usablemem_rgns.ranges[i].end = end; #ifdef DEBUG fprintf(stderr, "usable memory rgn %u: new base:%llx new size:%llx\n", i, base, size); #endif return; } else if (base < ustart) { usablemem_rgns.ranges[i].start = base; #ifdef DEBUG fprintf(stderr, "usable memory rgn %u: new base:%llx new size:%llx", i, base, usablemem_rgns.ranges[i].end - base); #endif return; } else if (end > uend){ usablemem_rgns.ranges[i].end = end; #ifdef DEBUG fprintf(stderr, "usable memory rgn %u: new end:%llx, new size:%llx", i, end, end - usablemem_rgns.ranges[i].start); #endif return; } } } usablemem_rgns.ranges[usablemem_rgns.size].start = base; usablemem_rgns.ranges[usablemem_rgns.size++].end = end; dbgprintf("usable memory rgns size:%u base:%llx size:%llx\n", usablemem_rgns.size, base, size); } int get_crash_kernel_load_range(uint64_t *start, uint64_t *end) { unsigned long long value; if (!get_devtree_value(DEVTREE_CRASHKERNEL_BASE, &value)) *start = value; else return -1; if (!get_devtree_value(DEVTREE_CRASHKERNEL_SIZE, &value)) *end = *start + value - 1; else return -1; return 0; } int is_crashkernel_mem_reserved(void) { int fd; fd = open(DEVTREE_CRASHKERNEL_BASE, O_RDONLY); if (fd < 0) return 0; close(fd); return 1; } #if 0 static int 
sort_regions(mem_rgns_t *rgn) { int i, j; unsigned long long tstart, tend; for (i = 0; i < rgn->size; i++) { for (j = 0; j < rgn->size - i - 1; j++) { if (rgn->ranges[j].start > rgn->ranges[j+1].start) { tstart = rgn->ranges[j].start; tend = rgn->ranges[j].end; rgn->ranges[j].start = rgn->ranges[j+1].start; rgn->ranges[j].end = rgn->ranges[j+1].end; rgn->ranges[j+1].start = tstart; rgn->ranges[j+1].end = tend; } } } return 0; } #endif