kernel hacker修炼之道之内存管理-物理内存探测
浅析linux内核内存管理之物理内存探测

作者:李万鹏
在系统boot的时候,kernel通过0x15中断获得机器内存容量。有三种参数88H(只能探测最大64MB的内存),E801H(得到大小),E802H(获得memory map)。这个memory map称为E820图,在kernel的初始化代码中会将这个memory map复制到一个kernel中的数据结构e820map里,kernel需要通过这个结构来计算可用的内存容量。
struct e820map {
int nr_map;
struct e820entry {
unsigned long long addr; /* start of memory segment */
unsigned long long size; /* size of memory segment */
unsigned long type; /* type of memory segment */
} map[E820MAX];
};
- 这里的nr_map是内存段的数量
- 每个内存段由struct e820entry表示
- addr字段表示内存段的起始地址
- size字段表示内存段的大小
- type表示内存段的类型,比如E820_RAM表示可用内存
- E820MAX是一个宏,为32,说明最多可以有32个内存段
在setup_arch函数中有这么两句,调用mach_specific_memory_setup将E820图复制到kernel中的数据结构中,包括了系统保留的段和空闲段,通过print_memory_map函数打印出来。
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
print_memory_map(machine_specific_memory_setup());下面来看machine_specific_memory_setup函数的实现:
static char * __init machine_specific_memory_setup(void)
{
char *who;
who = "BIOS-e820";
/*
* Try to copy the BIOS-supplied E820-map.
*
* Otherwise fake a memory map; one section from 0k->640k,
* the next section from 1mb->appropriate_mem_k
*/
sanitize_e820_map(E820_MAP, &E820_MAP_NR);
if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
unsigned long mem_size;
/* compare results from other methods and take the greater */
if (ALT_MEM_K < EXT_MEM_K) {
mem_size = EXT_MEM_K;
who = "BIOS-88";
} else {
mem_size = ALT_MEM_K;
who = "BIOS-e801";
}
e820.nr_map = 0;
add_memory_region(0, LOWMEMSIZE(), E820_RAM);
add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
}
return who;
}
- 首先调用sanitize_e820_map函数将重叠的去除
- 调用copy_e820_map函数将E820图copy到struct e820map结构中
- 如果BIOS没有提供该信息(在较古老的机器上可能是这样),内存自身生成一个表,0~0x9f000 ,1MB~E801或88找到的最大值
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
/* Only one memory region (or negative)? Ignore it */
if (nr_map < 2)
return -1;
do {
unsigned long long start = biosmap->addr;
unsigned long long size = biosmap->size;
unsigned long long end = start + size;
unsigned long type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
return -1;
/*
* Some BIOSes claim RAM in the 640k - 1M region.
* Not right. Fix it up.
*/
if (type == E820_RAM) {
if (start < 0x100000ULL && end > 0xA0000ULL) {
if (start < 0xA0000ULL)
add_memory_region(start, 0xA0000ULL-start, type);
if (end <= 0x100000ULL)
continue;
start = 0x100000ULL;
size = end - start;
}
}
add_memory_region(start, size, type);
} while (biosmap++,--nr_map);
return 0;
}
- 至少BIOS与RAM不是一个内存段的,所以nr_map < 2肯定是不对的
- 调用add_memory_region函数将E820图填充到struct e820map结构中
- 如果类型为E820_RAM,即可用内存,判断这个范围是否覆盖640KB~1MB,这段需要为ISA图形卡等保留的,所以这段要保留,如果谁覆盖了这段需要把这段抠除。物理地址从0x000a0000到0x000fffff的范围通常留给BIOS例程,并且映射ISA图形卡上的内部内存。这个区域就是所有的IBM兼容PC上从640KB~1MB之间著名的空洞:物理地址存在但被保留,对应的页框不能由操作系统使用。
static void __init add_memory_region(unsigned long long start,
unsigned long long size, int type)
{
int x;
if (!efi_enabled) {
x = e820.nr_map;
if (x == E820MAX) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
e820.map[x].addr = start;
e820.map[x].size = size;
e820.map[x].type = type;
e820.nr_map++;
}
} /* add_memory_region */如果内存段数量达到了最大值E820MAX即32,则oops。
static void __init print_memory_map(char *who)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
printk(" %s: %016Lx - %016Lx ", who,
e820.map[i].addr,
e820.map[i].addr + e820.map[i].size);
switch (e820.map[i].type) {
case E820_RAM: printk("(usable)\n");
break;
case E820_RESERVED:
printk("(reserved)\n");
break;
case E820_ACPI:
printk("(ACPI data)\n");
break;
case E820_NVS:
printk("(ACPI NVS)\n");
break;
default: printk("type %lu\n", e820.map[i].type);
break;
}
}
}调用print_memory_map打印出各个内存段的范围和类型,我的内存是2G的,打印结果如下:
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: 0000000000000000 - 000000000009f000 (usable)
[ 0.000000] BIOS-e820: 000000000009f000 - 00000000000a0000 (reserved)
[ 0.000000] BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
[ 0.000000] BIOS-e820: 0000000000100000 - 0000000001e00000 (usable)
[ 0.000000] BIOS-e820: 0000000001e00000 - 0000000001e80040 (reserved)
[ 0.000000] BIOS-e820: 0000000001e80040 - 000000007bed0000 (usable)
[ 0.000000] BIOS-e820: 000000007bed0000 - 000000007bed3000 (ACPI NVS)
[ 0.000000] BIOS-e820: 000000007bed3000 - 000000007bee0000 (ACPI data)
[ 0.000000] BIOS-e820: 000000007bee0000 - 000000007bf00000 (reserved)
[ 0.000000] BIOS-e820: 000000007c000000 - 0000000080000000 (reserved)
[ 0.000000] BIOS-e820: 00000000f0000000 - 00000000f4000000 (reserved)
[ 0.000000] BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)至此,kernel已经成功的通过0x 15参数E820H,得到BIOS的E820图,并将其填充内核中的e820map结构,供内核其他部分使用。
在setup_memory函数中会调用find_max_pfn,从e820map结构中获得可用内存的容量。下面来看几个宏:
#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
/*
* Reserved space for vmalloc and iomap - defined in asm/page.h
*/
#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
#define MAXMEM_PFN PFN_DOWN(MAXMEM)
#define MAX_NONPAE_PFN (1 << 20)
- PFN_UP,PFN_DOWN都是返回地址x对应的页帧号只是PFN_UP返回的是x地址下一个页的页帧号,PFN_DOWN返回的是x所在页的页帧号
- PFN_PHYS获得页帧号对应的物理地址
- MAXMEM是低端内存的最大值
- MAXMEM_PFN是低端内存最大一个页的页帧号
- MAX_NONPAE_PFN是给出4GB之上第一个页面的页面号
setup_memory是与体系结构密切相关的函数,跟踪其实现:
static unsigned long __init setup_memory(void)
{
unsigned long bootmap_size, start_pfn, max_low_pfn;
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
start_pfn = PFN_UP(init_pg_tables_end);
find_max_pfn();
max_low_pfn = find_max_low_pfn();
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn) {
highstart_pfn = max_low_pfn;
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = init_bootmem(start_pfn, max_low_pfn);
register_bootmem_low_pages(max_low_pfn);
/*
* Reserve the bootmem bitmap itself as well. We do this in two
* steps (first step was init_bootmem()) because this catches
* the (very unlikely) case of us accidentally initializing the
* bootmem allocator with an invalid RAM area.
*/
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
*/
reserve_bootmem(0, PAGE_SIZE);
。。。。。。。。。。。。
return max_low_pfn;
}
- PFN_UP获得_end后第一个page的页帧号来初始化start_pfn
- 调用find_max_low_pfn获得低端内存的最大页帧号
- 如果配置了CONFIG_HIGHMEM,则初始化highstart_pfn变量
- 调用init_bootmem初始化bootmem allocator
- 将max_low_pfn,即直接内存映射部分的page设置为可用
- 保留内核镜像(从0x100000开始,kernel code, kernel data, kernel bss),page 0,bootmem allocator的bitmap占用的空间
- 然会低端内存的最大页帧号
下面来看查找最大内存的函数:
void __init find_max_pfn(void)
{
int i;
max_pfn = 0;
if (efi_enabled) {
efi_memmap_walk(efi_find_max_pfn, &max_pfn);
return;
}
for (i = 0; i < e820.nr_map; i++) {
unsigned long start, end;
/* RAM? */
if (e820.map[i].type != E820_RAM)
continue;
start = PFN_UP(e820.map[i].addr);
end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
if (start >= end)
continue;
if (end > max_pfn)
max_pfn = end;
}
}
- 循环遍历e820map,获得最大物理页帧号
unsigned long __init find_max_low_pfn(void)
{
unsigned long max_low_pfn;
max_low_pfn = max_pfn;
if (max_low_pfn > MAXMEM_PFN) {
if (highmem_pages == -1)
highmem_pages = max_pfn - MAXMEM_PFN;
if (highmem_pages + MAXMEM_PFN < max_pfn)
max_pfn = MAXMEM_PFN + highmem_pages;
if (highmem_pages + MAXMEM_PFN > max_pfn) {
printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
highmem_pages = 0;
}
max_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n",
MAXMEM>>20);
if (max_pfn > MAX_NONPAE_PFN)
printk(KERN_WARNING "Use a PAE enabled kernel.\n");
else
printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_X86_PAE
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING "Warning only 4GB will be used.\n");
printk(KERN_WARNING "Use a PAE enabled kernel.\n");
}
#endif /* !CONFIG_X86_PAE */
#endif /* !CONFIG_HIGHMEM */
} else {
if (highmem_pages == -1)
highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
if (highmem_pages >= max_pfn) {
printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
highmem_pages = 0;
}
if (highmem_pages) {
if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
highmem_pages = 0;
}
max_low_pfn -= highmem_pages;
}
#else
if (highmem_pages)
printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
return max_low_pfn;
}
- 这里分两种情况进行讨论,一个是内存大于896MB,一个是内存小于896MB
- max_low_pfn > MAXMEM_PFN下的#ifndef CONFIG_HIGHMEM会设置max_pfn = MAXMEM_PFN;看出如果不开启highmem,即使内存大于896MB,也只能使用896MB