kernel hacker修炼之道之内存管理-物理内存探测

浅析linux内核内存管理之物理内存探测

作者:李万鹏


在系统boot的时候,kernel通过BIOS的0x15中断获得机器内存容量。有三种功能号:88H(只能探测最大64MB的内存),E801H(得到内存大小),E820H(获得memory map)。这个memory map称为E820图,在kernel的初始化代码中会将这个memory map复制到kernel中的数据结构e820map里,kernel需要通过这个结构来计算可用的内存容量。
/*
 * In-kernel copy of the BIOS E820 memory map: a counted array of memory
 * segments, each described by its start address, size and type.
 */
struct e820map {
    int nr_map;    /* number of entries currently stored in map[] */
    struct e820entry {
        unsigned long long addr; /* start of memory segment */
        unsigned long long size; /* size of memory segment */
        unsigned long type; /* type of memory segment */
    } map[E820MAX];    /* fixed capacity: at most E820MAX segments */
};
  • 这里的nr_map是内存段的数量
  • 每个内存段由struct e820entry表示
  • addr字段表示内存段的起始地址
  • size字段表示内存段的大小
  • type表示内存段的类型,比如E820_RAM表示可用内存
  • E820MAX是一个宏,为32,说明最多可以有32个内存段
在setup_arch函数中有这么两句,调用machine_specific_memory_setup将E820图复制到kernel中的数据结构中,包括了系统保留的段和空闲段,再通过print_memory_map函数打印出来。
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
print_memory_map(machine_specific_memory_setup());
下面来看machine_specific_memory_setup函数的实现:
/*
 * Populate the kernel's e820 map from the BIOS-supplied data.
 *
 * Returns a string naming the source the resulting map came from:
 * "BIOS-e820" when the BIOS E820 map was copied successfully, otherwise
 * "BIOS-88" or "BIOS-e801" depending on which fallback size was used.
 */
static char * __init machine_specific_memory_setup(void)
{
    char *who;

    who = "BIOS-e820";

    /*
     * Try to copy the BIOS-supplied E820-map.
     *
     * Otherwise fake a memory map; one section from 0k->640k,
     * the next section from 1mb->appropriate_mem_k
     */
    sanitize_e820_map(E820_MAP, &E820_MAP_NR);
    if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
        unsigned long mem_size;

        /* compare results from other methods and take the greater */
        if (ALT_MEM_K < EXT_MEM_K) {
            mem_size = EXT_MEM_K;
            who = "BIOS-88";
        } else {
            mem_size = ALT_MEM_K;
            who = "BIOS-e801";
        }

        /*
         * Start from an empty map: copy_e820_map() may already have
         * added some entries before it bailed out.
         */
        e820.nr_map = 0;
        add_memory_region(0, LOWMEMSIZE(), E820_RAM);
        /* mem_size << 10 multiplies by 1024 (presumably KB -> bytes; confirm) */
        add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
    }
    return who;
}
  • 首先调用sanitize_e820_map函数去除E820图中相互重叠的内存段
  • 调用copy_e820_map函数将E820图copy到struct e820map结构中
  • 如果BIOS没有提供该信息(在较古老的机器上可能是这样)或者copy_e820_map认为该图无效,内核自身生成一个表,包含两段:0~0x9f000,以及从1MB开始、大小取E801和88两种方式探测结果中较大值的一段
copy_e820_map函数实现:
/*
 * Copy nr_map entries of the BIOS-provided map into the kernel's e820 map
 * through add_memory_region(), cutting the 640k - 1M range out of any
 * entry of type E820_RAM.
 *
 * Returns 0 on success, or -1 when the map has fewer than two entries or
 * an entry's start + size wraps around 64 bits. On the overflow path,
 * entries processed earlier have already been added.
 */
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
    /* Only one memory region (or negative)? Ignore it */
    if (nr_map < 2)
        return -1;

    do {
        unsigned long long start = biosmap->addr;
        unsigned long long size = biosmap->size;
        unsigned long long end = start + size;
        unsigned long type = biosmap->type;

        /* Overflow in 64 bits? Ignore the memory map. */
        if (start > end)
            return -1;

        /*
         * Some BIOSes claim RAM in the 640k - 1M region.
         * Not right. Fix it up.
         */
        if (type == E820_RAM) {
            if (start < 0x100000ULL && end > 0xA0000ULL) {
                /* keep the part that lies below 640k, if there is one */
                if (start < 0xA0000ULL)
                    add_memory_region(start, 0xA0000ULL-start, type);
                /*
                 * Nothing reaches above 1M: this entry is done.
                 * 'continue' in a do/while jumps to the loop condition,
                 * so biosmap and nr_map are still advanced.
                 */
                if (end <= 0x100000ULL)
                    continue;
                /* keep only the part from 1M upwards */
                start = 0x100000ULL;
                size = end - start;
            }
        }
        add_memory_region(start, size, type);
    } while (biosmap++,--nr_map);
    return 0;
}
  • 一个合理的E820图至少应包含两个内存段(BIOS保留区与RAM不会在同一个内存段里),所以nr_map < 2时认为该图无效,直接返回-1
  • 调用add_memory_region函数将E820图填充到struct e820map结构中
  • 如果类型为E820_RAM,即可用内存,判断这个范围是否覆盖640KB~1MB,这段需要为ISA图形卡等保留的,所以这段要保留,如果谁覆盖了这段需要把这段抠除。物理地址从0x000a0000到0x000fffff的范围通常留给BIOS例程,并且映射ISA图形卡上的内部内存。这个区域就是所有的IBM兼容PC上从640KB~1MB之间著名的空洞:物理地址存在但被保留,对应的页框不能由操作系统使用。
调用add_memory_region添加相应的内存段到e820map:
/*
 * Append one memory segment (start, size, type) to the kernel's e820 map.
 *
 * Does nothing when efi_enabled is set. When the map already holds
 * E820MAX entries, the segment is dropped: the function only logs an
 * error with printk and returns, despite the "Ooops!" wording.
 */
static void __init add_memory_region(unsigned long long start, unsigned long long size, int type)
{
    int x;

    if (!efi_enabled) {
        /* index of the next free slot */
        x = e820.nr_map;

        if (x == E820MAX) {
            printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
            return;
        }

        e820.map[x].addr = start;
        e820.map[x].size = size;
        e820.map[x].type = type;
        e820.nr_map++;
    }
} /* add_memory_region */
如果内存段数量达到了最大值E820MAX即32,则oops。
/*
 * Print every entry of the kernel's e820 map, one line per entry:
 * the 'who' prefix, the start and end address (addr and addr + size)
 * and a label for the segment type.
 */
static void __init print_memory_map(char *who)
{
    int i;

    for (i = 0; i < e820.nr_map; i++) {
        printk(" %s: %016Lx - %016Lx ", who,
            e820.map[i].addr,
            e820.map[i].addr + e820.map[i].size);
        switch (e820.map[i].type) {
        case E820_RAM:
            printk("(usable)\n");
            break;
        case E820_RESERVED:
            printk("(reserved)\n");
            break;
        case E820_ACPI:
            printk("(ACPI data)\n");
            break;
        case E820_NVS:
            printk("(ACPI NVS)\n");
            break;
        default:
            /* unknown type: print the raw number */
            printk("type %lu\n", e820.map[i].type);
            break;
        }
    }
}
调用print_memory_map打印出各个内存段的范围和类型,我的内存是2G的,打印结果如下:
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: 0000000000000000 - 000000000009f000 (usable)
[ 0.000000] BIOS-e820: 000000000009f000 - 00000000000a0000 (reserved)
[ 0.000000] BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
[ 0.000000] BIOS-e820: 0000000000100000 - 0000000001e00000 (usable)
[ 0.000000] BIOS-e820: 0000000001e00000 - 0000000001e80040 (reserved)
[ 0.000000] BIOS-e820: 0000000001e80040 - 000000007bed0000 (usable)
[ 0.000000] BIOS-e820: 000000007bed0000 - 000000007bed3000 (ACPI NVS)
[ 0.000000] BIOS-e820: 000000007bed3000 - 000000007bee0000 (ACPI data)
[ 0.000000] BIOS-e820: 000000007bee0000 - 000000007bf00000 (reserved)
[ 0.000000] BIOS-e820: 000000007c000000 - 0000000080000000 (reserved)
[ 0.000000] BIOS-e820: 00000000f0000000 - 00000000f4000000 (reserved)
[ 0.000000] BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)
至此,kernel已经成功地通过0x15中断的E820H功能,得到BIOS的E820图,并将其填充到内核中的e820map结构,供内核其他部分使用。

kernel hacker修炼之道之内存管理-物理内存探测
在setup_memory函数中会调用find_max_pfn,从e820map结构中获得可用内存的容量。下面来看几个宏:
/*
 * Address <-> page frame number helpers.
 * (Comments assume PAGE_SIZE == 1 << PAGE_SHIFT - confirm in asm/page.h.)
 */
/* Round address x up to a page boundary and return that page frame number. */
#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
/* Page frame number of the page that contains address x (rounds down). */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
/* Address at which page frame number x starts. */
#define PFN_PHYS(x) ((x) << PAGE_SHIFT)

/*
 * Reserved space for vmalloc and iomap - defined in asm/page.h
 */
#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
/* MAXMEM expressed as a page frame number. */
#define MAXMEM_PFN PFN_DOWN(MAXMEM)
/* 2^20 page frames; find_max_low_pfn() reports this limit as "4GB". */
#define MAX_NONPAE_PFN (1 << 20)
  • PFN_UP,PFN_DOWN都是返回地址x对应的页帧号:PFN_DOWN向下取整,返回x所在页的页帧号;PFN_UP向上取整,x恰好页对齐时返回x所在页的页帧号,否则返回下一个页的页帧号
  • PFN_PHYS获得页帧号对应的物理地址
  • MAXMEM是低端内存的最大值
  • MAXMEM_PFN是MAXMEM对应的页帧号,即低端内存页帧号的上限(低端内存的页帧号都小于它)
  • MAX_NONPAE_PFN为1 << 20,即物理地址4GB处的页帧号(按4KB页计算),不开启PAE时可用的页帧号都小于它

setup_memory是与体系结构密切相关的函数,跟踪其实现:
static unsigned long __init setup_memory(void) { unsigned long bootmap_size, start_pfn, max_low_pfn; /* * partially used pages are not usable - thus * we are rounding upwards: */ start_pfn = PFN_UP(init_pg_tables_end); find_max_pfn(); max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) { highstart_pfn = max_low_pfn; } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = init_bootmem(start_pfn, max_low_pfn); register_bootmem_low_pages(max_low_pfn); /* * Reserve the bootmem bitmap itself as well. We do this in two * steps (first step was init_bootmem()) because this catches * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ reserve_bootmem(0, PAGE_SIZE); 。。。。。。。。。。。。 return max_low_pfn; }
  • 用PFN_UP获得init_pg_tables_end之后第一个完整page的页帧号来初始化start_pfn(被部分占用的页不可用,所以向上取整)
  • 调用find_max_low_pfn获得低端内存的最大页帧号
  • 如果配置了CONFIG_HIGHMEM,则初始化highstart_pfn变量
  • 调用init_bootmem初始化bootmem allocator
  • 调用register_bootmem_low_pages,将max_low_pfn以下,即直接内存映射部分的page设置为可用
  • 保留内核镜像(从0x100000开始,kernel code, kernel data, kernel bss),page 0,bootmem allocator的bitmap占用的空间
  • 返回低端内存的最大页帧号
下面来看查找最大内存的函数:
/*
 * Compute max_pfn from the memory map.
 *
 * With efi_enabled set, the work is delegated to efi_memmap_walk().
 * Otherwise every E820_RAM entry of the e820 map is examined and max_pfn
 * becomes the largest 'end' page frame number found, where 'end' is the
 * frame just past the last complete page of the entry. max_pfn stays 0
 * when no entry qualifies.
 */
void __init find_max_pfn(void)
{
    int i;

    max_pfn = 0;
    if (efi_enabled) {
        efi_memmap_walk(efi_find_max_pfn, &max_pfn);
        return;
    }

    for (i = 0; i < e820.nr_map; i++) {
        unsigned long start, end;
        /* RAM? */
        if (e820.map[i].type != E820_RAM)
            continue;
        /* only whole pages count: round the start up and the end down */
        start = PFN_UP(e820.map[i].addr);
        end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
        /* the entry does not contain a single complete page */
        if (start >= end)
            continue;
        if (end > max_pfn)
            max_pfn = end;
    }
}
  • 循环遍历e820map,获得最大物理页帧号
/*
 * Decide how many page frames are treated as low memory and return that
 * count as max_low_pfn. May also adjust the globals max_pfn and
 * highmem_pages.
 *
 * Two cases, split on whether max_pfn exceeds MAXMEM_PFN:
 *  - more memory than MAXMEM_PFN: low memory is capped at MAXMEM_PFN and
 *    the rest is highmem (or is ignored without CONFIG_HIGHMEM);
 *  - otherwise: all memory is low memory, minus any highmem_pages that
 *    were explicitly requested (CONFIG_HIGHMEM only).
 *
 * NOTE(review): highmem_pages == -1 appears to mean "no highmem size was
 * requested" - confirm where highmem_pages is set.
 */
unsigned long __init find_max_low_pfn(void)
{
    unsigned long max_low_pfn;

    max_low_pfn = max_pfn;
    if (max_low_pfn > MAXMEM_PFN) {
        /* default: everything above MAXMEM_PFN is highmem */
        if (highmem_pages == -1)
            highmem_pages = max_pfn - MAXMEM_PFN;
        /* requested highmem is smaller than what exists: shrink max_pfn */
        if (highmem_pages + MAXMEM_PFN < max_pfn)
            max_pfn = MAXMEM_PFN + highmem_pages;
        /* requested highmem is larger than what exists: ignore the request */
        if (highmem_pages + MAXMEM_PFN > max_pfn) {
            printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
            highmem_pages = 0;
        }
        max_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
        /* Maximum memory usable is what is directly addressable */
        printk(KERN_WARNING "Warning only %ldMB will be used.\n",
                    MAXMEM>>20);
        if (max_pfn > MAX_NONPAE_PFN)
            printk(KERN_WARNING "Use a PAE enabled kernel.\n");
        else
            printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
        /* without highmem support nothing above MAXMEM_PFN is used */
        max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_X86_PAE
        /* without PAE, page frames beyond MAX_NONPAE_PFN are dropped */
        if (max_pfn > MAX_NONPAE_PFN) {
            max_pfn = MAX_NONPAE_PFN;
            printk(KERN_WARNING "Warning only 4GB will be used.\n");
            printk(KERN_WARNING "Use a PAE enabled kernel.\n");
        }
#endif /* !CONFIG_X86_PAE */
#endif /* !CONFIG_HIGHMEM */
    } else {
        /* all memory fits below MAXMEM_PFN: no highmem by default */
        if (highmem_pages == -1)
            highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
        if (highmem_pages >= max_pfn) {
            printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
            highmem_pages = 0;
        }
        if (highmem_pages) {
            /* refuse a split that would leave less than 64MB of lowmem */
            if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
                printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
                highmem_pages = 0;
            }
            /* carve the requested highmem out of the top of memory */
            max_low_pfn -= highmem_pages;
        }
#else
        if (highmem_pages)
            printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
    }
    return max_low_pfn;
}
  • 这里分两种情况进行讨论,一个是内存大于896MB,一个是内存小于896MB
  • max_low_pfn > MAXMEM_PFN下的#ifndef CONFIG_HIGHMEM会设置max_pfn = MAXMEM_PFN;看出如果不开启highmem,即使内存大于896MB,也只能使用896MB