Notes on the Linux buddy system

How Linux organizes memory

  1. Memory nodes: each node is associated with a processor and is described by a pg_data_t structure.
  2. A node is further divided into zones, described by struct zone; each zone manages pages of the same kind.
  3. The page is the smallest unit of system memory; every physical page frame gets a struct page instance. (A small sketch of how the three levels connect follows this list.)
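
As a quick orientation, here is a minimal sketch (kernel-module context assumed, error handling omitted; dump_node_zones is a hypothetical helper) that walks the node -> zone hierarchy using the structures described below. NODE_DATA, for_each_online_node and populated_zone are standard kernel accessors.

#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

/* Walk every online node and print its populated zones. */
static void dump_node_zones(void)
{
	int nid, i;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = &pgdat->node_zones[i];

			if (!populated_zone(zone))
				continue;
			pr_info("node %d zone %-8s start_pfn %lu present %lu\n",
				nid, zone->name, zone->zone_start_pfn,
				zone->present_pages);
		}
	}
}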

Key data structures and their main members

struct pglist_data

Member                              Description
node_zones[MAX_NR_ZONES]            the zones belonging to this node
node_zonelists[MAX_ZONELISTS]       allocations walk these zonelists; UMA only has ZONELIST_FALLBACK, NUMA additionally has ZONELIST_NOFALLBACK
nr_zones                            number of populated zones in this node
node_start_pfn                      first page frame number of this node
node_present_pages                  number of page frames in this node, excluding holes
node_spanned_pages                  number of page frames spanned by this node, including holes
node_id                             node ID
kswapd_wait                         kswapd wait queue; every node has a kswapd kernel thread that reclaims pages from the LRU lists
struct task_struct *kswapd          pointer to the kswapd thread's task_struct
kswapd_order                        page block order kswapd is currently reclaiming for
kswapd_classzone_idx                zone index (DMA, NORMAL, ...) kswapd is currently reclaiming up to
kcompactd_max_order                 order requested from memory compaction
kcompactd_classzone_idx             zone index compaction runs against
kcompactd_wait                      kcompactd wait queue
kcompactd                           task_struct of the kcompactd thread
totalreserve_pages                  pages reserved on this node
lruvec                              the LRU lists
vm_stat[NR_VM_NODE_STAT_ITEMS]      per-node statistics

struct zone

Member                              Description
watermark[NR_WMARK]                 watermarks describing the zone's memory pressure
nr_reserved_highatomic              pages reserved for high-order atomic allocations
lowmem_reserve[MAX_NR_ZONES]        pages this zone keeps back from allocations that target lower zones
zone_pgdat                          pointer to the pglist_data this zone belongs to
pageset                             per_cpu_pageset; single (order-0) pages are allocated from here
pageblock_flags                     bitmap storing the migratetype of each pageblock (block of pageblock_order pages)
zone_start_pfn                      first page frame number of the zone
spanned_pages                       zone_end_pfn - zone_start_pfn (including holes)
present_pages                       spanned_pages - absent_pages (pages in holes)
managed_pages                       present_pages - reserved_pages
free_area[MAX_ORDER]                the buddy allocator's free lists
vm_stat[NR_VM_ZONE_STAT_ITEMS]      per-zone statistics

Core allocator function

__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask)
This is the core function of the page allocator.

Parameters

Parameter   Meaning
gfp_mask    passed in by the caller; this mask steers the allocator's behaviour
order       2^order contiguous pages are requested
zonelist    one of two lists: ZONELIST_FALLBACK or ZONELIST_NOFALLBACK
nodemask    which nodes the allocation may be satisfied from
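
Callers normally reach __alloc_pages_nodemask through wrappers such as alloc_pages(), which supplies the current node's zonelist via node_zonelist(). A minimal usage sketch (demo_alloc and the chosen order are made up):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static int demo_alloc(void)
{
	/* order = 1 -> 2^1 = 2 physically contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (!page)
		return -ENOMEM;

	pr_info("allocated pfn %lu\n", page_to_pfn(page));
	__free_pages(page, 1);
	return 0;
}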

zonelist = node_zonelist(nid, gfp_mask)

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
	if (unlikely(flags & __GFP_THISNODE))
		return ZONELIST_NOFALLBACK;
#endif
	return ZONELIST_FALLBACK;
}

As the code shows, the zonelist is chosen from gfp_mask: if __GFP_THISNODE is not set, ZONELIST_FALLBACK is used.
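
For example, on NUMA a caller can pin the request to a single node with __GFP_THISNODE, which makes gfp_zonelist() return ZONELIST_NOFALLBACK. A minimal sketch (page_on_node is a hypothetical wrapper):

#include <linux/gfp.h>

/* Allocate strictly from node nid; no fallback to other nodes. */
static struct page *page_on_node(int nid, unsigned int order)
{
	return alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, order);
}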

Annotated walk-through of the main parts

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW; //the fast path allocates against the LOW watermark
	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
	// alloc_mask may be modified as the function runs; gfp_mask must stay unmodified
	struct alloc_context ac = {
		.high_zoneidx = gfp_zone(gfp_mask), //highest usable zone index derived from gfp_mask (high -> normal -> dma); set only once here
		.zonelist = zonelist, //the zonelist the allocation operates on; set only once here
		.nodemask = nodemask,//which nodes may be used (relevant for NUMA); set only once here
		.migratetype = gfpflags_to_migratetype(gfp_mask), //which migratetype to allocate from; may be adjusted later as needed
	};
	};
	/* Dirty zone balancing only done in the fast path */
	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	 //pick the zone the first attempt starts from
	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
					ac.high_zoneidx, ac.nodemask);
	//no usable preferred zone (cpuset related)
	if (!ac.preferred_zoneref->zone) {
		page = NULL;
		/*
		 * This might be due to race with cpuset_current_mems_allowed
		 * update, so make sure we retry with original nodemask in the
		 * slow path.
		 */
		goto no_zone;
	}
	/* First allocation attempt */
	//the first (fast-path) allocation attempt
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page)) //a page block was obtained, the allocation succeeded
		goto out;

no_zone:
	/*
	 * Runtime PM, block IO and its error handling path can deadlock
	 * because I/O on the device might not complete.
	 */
	alloc_mask = memalloc_noio_flags(gfp_mask);
	ac.spread_dirty_pages = false;

	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	if (unlikely(ac.nodemask != nodemask)) //NUMA
		ac.nodemask = nodemask;
	//the fast path could not produce a page block, fall into the slow path
	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
	    unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}

	if (kmemcheck_enabled && page) //run kmemcheck on the new pages if it is enabled
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

	return page; //return the result of the allocation
}

From the code, the allocator has two paths: a fast path that allocates as long as the zone stays above the LOW watermark, and a slow path that is entered when the watermark check fails or the fast-path attempt comes back empty. In both cases the pages are ultimately taken by get_page_from_freelist.
get_page_from_freelist (flow chart omitted):

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z = ac->preferred_zoneref;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	 //walk the zonelist, restricted to zones at or below high_zoneidx
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		struct page *page;
		unsigned long mark;

		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
				continue; //cpuset does not allow this zone
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a node that is within its dirty
		 * limit, such that no single node holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the node's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-node dirty limit in the slowpath
		 * (spread_dirty_pages unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * nodes are together not big enough to reach the
		 * global limit.  The proper fix for these situations
		 * will require awareness of nodes in the
		 * dirty-throttling and the flusher threads.
		 */
		if (ac->spread_dirty_pages) {
			if (last_pgdat_dirty_limit == zone->zone_pgdat)
				continue;

			if (!node_dirty_ok(zone->zone_pgdat)) {
				last_pgdat_dirty_limit = zone->zone_pgdat;
				continue;
			}
		}
		//pick the watermark that applies to this allocation
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		//check whether the zone satisfies that watermark
		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)  //below the watermark, but watermarks may be ignored, so keep going
				goto try_this_zone;

			if (node_reclaim_mode == 0 ||  //NUMA
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;

			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); //NUMA
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue; //this zone is below the watermark, check the next one
			}
		}

try_this_zone:
		//take a page block from the buddy system
		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			//for a high-order ALLOC_HARDER allocation, mark the pageblock the page sits in as MIGRATE_HIGHATOMIC
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		}
	}

	return NULL;
}

zone_watermark_fast: checking the watermark

static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
		unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES); //free pages in this zone
	long cma_pages = 0;

#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA)) //ALLOC_CMA is not set, so free CMA pages must be discounted
		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/*
	 * Fast check for order-0 only. If this fails then the reserves
	 * need to be calculated. There is a corner case where the check
	 * passes but only the high-order atomic reserve are free. If
	 * the caller is !atomic then it'll uselessly search the free
	 * list. That corner case is then slower but it is harmless.
	 */
	 //fast check for the order-0 case: if the free pages minus the CMA pages
	 //exceed the watermark plus the lowmem reserve, the check passes and a page can be handed out quickly
	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
		return true;
	//if the order-0 fast check failed, or this is a high-order request, do the full check
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
					free_pages);
}
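
A worked example of the fast check with made-up numbers (not taken from a real system):

	free_pages          = 12000   (NR_FREE_PAGES of the zone)
	cma_pages           =  2000   (unusable here, ALLOC_CMA not set)
	mark (WMARK_LOW)    =  8000
	lowmem_reserve[idx] =  1500

	12000 - 2000 = 10000 > 8000 + 1500 = 9500  ->  the order-0 fast check passes

If free_pages dropped to 11000, 9000 > 9500 would be false and __zone_watermark_ok() below would have to decide.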

bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
			 int classzone_idx, unsigned int alloc_flags,
			 long free_pages)
{
	long min = mark; //the watermark to enforce
	int o;
	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);

	/* free_pages may go negative - that's OK */
	free_pages -= (1 << order) - 1; //discount the pages this request would consume (2^order - 1)

	if (alloc_flags & ALLOC_HIGH) //ALLOC_HIGH: cut the watermark in half
		min -= min / 2;

	/*
	 * If the caller does not have rights to ALLOC_HARDER then subtract
	 * the high-atomic reserves. This will over-estimate the size of the
	 * atomic reserve but it avoids a search.
	 */
	if (likely(!alloc_harder)) //without ALLOC_HARDER the highatomic reserve may not be used
		free_pages -= z->nr_reserved_highatomic;
	else //with ALLOC_HARDER, lower the watermark by another quarter
		min -= min / 4;

#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA)) //without ALLOC_CMA, free CMA pages cannot be used
		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/*
	 * Check watermarks for an order-0 allocation request. If these
	 * are not met, then a high-order request also cannot go ahead
	 * even if a suitable page happened to be free.
	 */
	 //if the free pages are not above the watermark plus the lowmem reserve, the check fails
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return false;

	/* If this is an order-0 request then the watermark is fine */
	if (!order) //order-0: the watermark is fine
		return true;

	/* For a high-order request, check at least one suitable page is free */
	//high-order check: at least one suitably sized block must be free
	for (o = order; o < MAX_ORDER; o++) {
		struct free_area *area = &z->free_area[o];
		int mt;

		if (!area->nr_free)//skip free_areas with no free blocks
			continue;

		if (alloc_harder) //ALLOC_HARDER allocations may take any migratetype
			return true;
	    //everyone else must find a block in a matching migratetype list
		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
			if (!list_empty(&area->free_list[mt]))
				return true;
		}

#ifdef CONFIG_CMA
		if ((alloc_flags & ALLOC_CMA) &&
		    !list_empty(&area->free_list[MIGRATE_CMA])) {
			return true;
		}
#endif
	}
	return false;
}
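
To make the watermark adjustments concrete, a small made-up example:

	min = 8000                      (WMARK_MIN of the zone, made-up)
	ALLOC_HIGH:   min -= min/2  ->  min = 4000
	ALLOC_HARDER: min -= min/4  ->  min = 3000   (3/8 of the original)
	without ALLOC_HARDER: min stays 4000, but free_pages -= nr_reserved_highatomic instead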

Allocating pages from a specific zone: buffered_rmqueue

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	bool cold = ((gfp_flags & __GFP_COLD) != 0); //allocate a cache-hot or a cache-cold page?

	if (likely(order == 0)) { //single pages come from the per-cpu pcplists
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags); //disable interrupts, the allocation must not be interrupted
		do {
			pcp = &this_cpu_ptr(zone->pageset)->pcp; //this CPU's per-cpu page set
			list = &pcp->lists[migratetype]; //list for the requested migratetype
			if (list_empty(list)) { //if that list is empty, pull pcp->batch pages from the buddy system into it first
				pcp->count += rmqueue_bulk(zone, 0,
						pcp->batch, list,
						migratetype, cold);
				if (unlikely(list_empty(list)))
					goto failed;
			}

			if (cold)
				page = list_last_entry(list, struct page, lru); //cold pages come from the tail of the list
			else
				page = list_first_entry(list, struct page, lru); //hot pages come from the head of the list

			list_del(&page->lru); //take the page off the list
			pcp->count--; //one fewer page on the pcp list

		} while (check_new_pcp(page));
	} else {
		/*
		 * We most definitely don't want callers attempting to
		 * allocate greater than order-1 page units with __GFP_NOFAIL.
		 */
		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
		spin_lock_irqsave(&zone->lock, flags);

		do {
			page = NULL;
			if (alloc_flags & ALLOC_HARDER) { //ALLOC_HARDER may allocate from the MIGRATE_HIGHATOMIC reserve
				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
				if (page)
					trace_mm_page_alloc_zone_locked(page, order, migratetype);
			}
			if (!page) //not ALLOC_HARDER, or the MIGRATE_HIGHATOMIC attempt failed: fall back to __rmqueue
				page = __rmqueue(zone, order, migratetype);
		} while (page && check_new_pages(page, order));
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_freepage_state(zone, -(1 << order),
					  get_pcppage_migratetype(page));
	}

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);//re-enable interrupts

	VM_BUG_ON_PAGE(bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags); //re-enable interrupts
	return NULL;
}

Allocating pages from the requested migratetype

static struct page *__rmqueue(struct zone *zone, unsigned int order,
				int migratetype)
{
	struct page *page;
	//first try the requested migratetype
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page) //the requested migratetype cannot satisfy the request, try the fallback lists
			page = __rmqueue_fallback(zone, order, migratetype);
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	//walk the orders from the requested one up to MAX_ORDER
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		//look at area->free_list[migratetype] and take the first suitable page block
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page) //nothing on this area->free_list[migratetype], try the next order
			continue;
		list_del(&page->lru); //a suitable page block was found, unlink it
		rmv_page_order(page); //clear the buddy flag and page->private (which held the order)
		area->nr_free--; //one fewer free block of this order
		//split the block and hand the unused halves back to the buddy system
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}
//low is the order that was requested, high is current_order
//area is the free_area of current_order
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high; //size of the current page block

	while (high > low) { //as long as the block is larger than requested
		area--;//step down to the next lower order's free_area
		high--;//next lower order
		size >>= 1; //halve the block
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		list_add(&page[size].lru, &area->free_list[migratetype]); //put the upper half back on the matching free_area
		area->nr_free++; //one more free block of this order
		set_page_order(&page[size], high);
	}
}
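
A concrete, made-up walk through expand(): an order-1 request (low = 1) is satisfied from an order-3 block (high = 3, size = 8 pages):

	high = 2, size = 4:  page[4..7] go to free_area[2], nr_free++
	high = 1, size = 2:  page[2..3] go to free_area[1], nr_free++
	loop ends (high == low); the caller keeps page[0..1], exactly 2^1 pages
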
When the requested migratetype cannot satisfy the allocation, pages are taken from its fallback migratetypes:
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
	struct free_area *area;
	unsigned int current_order;
	struct page *page;
	int fallback_mt;
	bool can_steal;

	/* Find the largest possible block of pages in the other list */
	//take the largest block possible; to limit fragmentation, the stolen pageblock may be re-marked with the new migratetype
	for (current_order = MAX_ORDER-1;
				current_order >= order && current_order <= MAX_ORDER-1;
				--current_order) {
		area = &(zone->free_area[current_order]);
		//find a suitable fallback_mt and decide whether the block can be stolen
		fallback_mt = find_suitable_fallback(area, current_order,
				start_migratetype, false, &can_steal);
		if (fallback_mt == -1)
			continue;
		//take a page block from area->free_list[fallback_mt]
		page = list_first_entry(&area->free_list[fallback_mt],
						struct page, lru);
		if (can_steal) //if stealing is allowed, re-mark the page block as start_migratetype
			steal_suitable_fallback(zone, page, start_migratetype);

		/* Remove the page from the freelists */
		area->nr_free--;
		list_del(&page->lru); //unlink the page block from its old list
		rmv_page_order(page); //clear the buddy flag and page->private (which held the order)
		//the unused part of the block goes onto the start_migratetype lists
		expand(zone, page, order, current_order, area,
					start_migratetype);
		/*
		 * The pcppage_migratetype may differ from pageblock's
		 * migratetype depending on the decisions in
		 * find_suitable_fallback(). This is OK as long as it does not
		 * differ for MIGRATE_CMA pageblocks. Those can be used as
		 * fallback only via special __rmqueue_cma_fallback() function
		 */
		set_pcppage_migratetype(page, start_migratetype);

		trace_mm_page_alloc_extfrag(page, order, current_order,
			start_migratetype, fallback_mt);

		return page;
	}

	return NULL;
}
/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages all together. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{
	int i;
	int fallback_mt;

	if (area->nr_free == 0) //this area has no free pages at all, bail out
		return -1;

	*can_steal = false;
	for (i = 0;; i++) { //walk the fallbacks[] of this migratetype
		fallback_mt = fallbacks[migratetype][i];
		if (fallback_mt == MIGRATE_TYPES) //all fallbacks of this migratetype have been tried
			break;

		if (list_empty(&area->free_list[fallback_mt])) //this fallback_mt has no free blocks of this order
			continue;

		if (can_steal_fallback(order, migratetype)) //can the block be stolen?
			*can_steal = true;

		if (!only_stealable) //a non-stealable fallback_mt is acceptable
			return fallback_mt;

		if (*can_steal)//otherwise only return a fallback_mt that can be stolen
			return fallback_mt;
	}

	return -1;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
 //the fallback order for each migratetype
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	[MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif
};
static bool can_steal_fallback(unsigned int order, int start_mt)
{
	/*
	 * Leaving this order check is intended, although there is
	 * relaxed order check in next check. The reason is that
	 * we can actually steal whole pageblock if this condition met,
	 * but, below check doesn't guarantee it and that is just heuristic
	 * so could be changed anytime.
	 */
	if (order >= pageblock_order) //a whole pageblock (or more) can always be stolen
		return true;
	//half a pageblock or more, a RECLAIMABLE or UNMOVABLE request, or mobility grouping disabled: stealing is allowed
	if (order >= pageblock_order / 2 ||
		start_mt == MIGRATE_RECLAIMABLE ||
		start_mt == MIGRATE_UNMOVABLE ||
		page_group_by_mobility_disabled)
		return true;

	return false;
}
static void steal_suitable_fallback(struct zone *zone, struct page *page,
							  int start_type)
{
	unsigned int current_order = page_order(page); //order of the current page block
	int pages;

	/* Take ownership for orders >= pageblock_order */
	//if the block covers at least a whole pageblock
	if (current_order >= pageblock_order) { 
		//mark the whole range with start_type
		change_pageblock_range(page, current_order, start_type);
		return;
	}
	//the block is smaller than a pageblock: first move its free pages onto the start_type free list
	pages = move_freepages_block(zone, page, start_type);

	/* Claim the whole block if over half of it is free */
	//if at least half of the pageblock is free, re-mark the whole pageblock as start_type
	if (pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);
}
//move_freepages_block vs. set_pageblock_migratetype:
//move_freepages_block only moves the free pages onto the start_type list;
//the pageblock's migratetype is unchanged, so when those pages are freed they
//go back to the original migratetype's list.
//set_pageblock_migratetype changes the migratetype for good, so freed pages
//end up on the new migratetype's list.

The slow allocation path

First the flow (flow chart of __alloc_pages_slowpath omitted).


static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;//did the caller allow direct page reclaim?
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned long alloc_start = jiffies;
	unsigned int stall_timeout = 10 * HZ;
	unsigned int cpuset_mems_cookie;

	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	if (order >= MAX_ORDER) { //a request larger than the largest order can never succeed
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * We also sanity check to catch abuse of atomic reserves being used by
	 * callers that are not in atomic context.
	 */
	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC; //atomic and direct reclaim do not mix, drop __GFP_ATOMIC

retry_cpuset:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_priority = DEF_COMPACT_PRIORITY;
	cpuset_mems_cookie = read_mems_allowed_begin();
	/*
	 * We need to recalculate the starting point for the zonelist iterator
	 * because we might have used different nodemask in the fast path, or
	 * there was a cpuset modification and we are retrying - otherwise we
	 * could end up iterating over non-eligible zones endlessly.
	 */
	 //a nodemask or cpuset change may move preferred_zoneref; on UMA with cpusets disabled it does not change
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	if (!ac->preferred_zoneref->zone) //no zone to start from, the allocation fails
		goto nopage;


	/*
	 * The fast path uses conservative alloc_flags to succeed only until
	 * kswapd needs to be woken up, and to avoid the cost of setting up
	 * alloc_flags precisely. So we do that now.
	 */
	 //the fast path only looked at the watermark; in the slow path alloc_flags is set up precisely
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	if (gfp_mask & __GFP_KSWAPD_RECLAIM) //if waking kswapd is allowed, wake it on every eligible node
		wake_all_kswapds(order, ac);

	/*
	 * The adjusted alloc_flags might result in immediate success, so try
	 * that first
	 */
	 //retry the allocation with the adjusted alloc_flags
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page) //success, return the page
		goto got_pg;

	/*
	 * For costly allocations, try direct compaction first, as it's likely
	 * that we have enough base pages and don't need to reclaim. Don't try
	 * that for allocations that are allowed to ignore watermarks, as the
	 * ALLOC_NO_WATERMARKS attempt didn't yet happen.
	 */
	 //for costly (order > 3) requests, try direct compaction first: there may be
	 //plenty of base pages that compaction can merge into a high-order block
	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
		!gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page) //compaction produced a page, return it
			goto got_pg;

		/*
		 * Checks for costly allocations with __GFP_NORETRY, which
		 * includes THP page fault allocations
		 */
		if (gfp_mask & __GFP_NORETRY) { //compaction found nothing and the caller asked not to retry
			/*
			 * If compaction is deferred for high-order alloations,
			 * it is because sync compaction recently failed. If
			 * this is the case and the caller requested a THP
			 * allocation, we do not want to heavily disrupt the
			 * system, so we fail the allocation instead of entering
			 * direct reclaim.
			 */
			if (compact_result == COMPACT_DEFERRED) //compaction was deferred (it failed recently)
				goto nopage; //give up rather than disrupt the system

			/*
			 * Looks like reclaim/compaction is worth trying, but
			 * sync compaction could be very expensive, so keep
			 * using async compaction.
			 */
			compact_priority = INIT_COMPACT_PRIORITY; //keep using asynchronous compaction
		}
	}

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	//make sure kswapd does not go back to sleep while we keep looping
	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
		wake_all_kswapds(order, ac);

	if (gfp_pfmemalloc_allowed(gfp_mask)) //e.g. the task has PF_MEMALLOC set
		alloc_flags = ALLOC_NO_WATERMARKS; //then watermarks may be ignored

	/*
	 * Reset the zonelist iterators if memory policies can be ignored.
	 * These allocations are high priority and system rather than user
	 * orientated.
	 */
	 //if watermarks may be ignored, reset ac->zonelist and ac->preferred_zoneref
	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	//try again with the possibly adjusted zonelist and alloc_flags
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page) //got a page, return it
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim) { //still no page, and the caller is not willing to reclaim directly
		/*
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually allow this type of allocation
		 * to fail.
		 */
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
		goto nopage; //so the allocation fails
	}

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC) { //the current task has PF_MEMALLOC: it may dip into the emergency reserves because it will soon free other memory
		/*
		 * __GFP_NOFAIL request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us.
		 */
		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { //the caller insists the allocation must not fail
			cond_resched(); //schedule once, another task may free memory; avoids recursing into reclaim
			goto retry; //and try the allocation again
		}
		goto nopage; //without __GFP_NOFAIL the allocation fails here
	}

	/* Avoid allocations with no watermarks from looping endlessly */
	//avoid looping endlessly with watermarks ignored
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;


	/* Try direct reclaim and then allocating */
	//try direct reclaim, then allocate again
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page) //got a page, return it
		goto got_pg;

	/* Try direct compaction and then allocating */
	//direct reclaim did not help, try direct compaction
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)//got a page, return it
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY) //direct compaction failed too, and the caller asked not to retry
		goto nopage; //fail

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_REPEAT
	 */
	 //costly (order > 3) requests are not retried unless __GFP_REPEAT is set
	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
		goto nopage; //fail

	/* Make sure we know about allocations which stall for too long */
	//warn if the allocation has been stalling for too long
	if (time_after(jiffies, alloc_start + stall_timeout)) {
		warn_alloc(gfp_mask,
			"page allocation stalls for %ums, order:%u",
			jiffies_to_msecs(jiffies-alloc_start), order);
		stall_timeout += 10 * HZ;
	}
	//decide whether reclaim should be retried
	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry; //if so, try the allocation again

	/*
	 * It doesn't make any sense to retry for the compaction if the order-0
	 * reclaim is not able to make any progress because the current
	 * implementation of the compaction depends on the sufficient amount
	 * of free memory (see __compaction_suitable)
	 */
	 //if reclaim made some progress and compaction is worth retrying, try again
	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

	/*
	 * It's possible we raced with cpuset update so the OOM would be
	 * premature (see below the nopage: label for full explanation).
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	/* Reclaim has failed us, start killing things */
	//reclaim has failed; kill a process (OOM) to free memory, then allocate again
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page) //the allocation succeeded, return directly
		goto got_pg;

	/* Retry as long as the OOM killer is making progress */
	//the OOM kill did not directly satisfy the allocation, but some memory was freed, so retry
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/*
	 * When updating a task's mems_allowed or mempolicy nodemask, it is
	 * possible to race with parallel threads in such a way that our
	 * allocation can fail while the mask is being updated. If we are about
	 * to fail, check if the cpuset changed during allocation and if so,
	 * retry.
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		goto retry_cpuset;

	warn_alloc(gfp_mask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}
Adjusting alloc_flags:
The fast path only checked whether the LOW watermark was met.
On entering the slow path the watermark is lowered to MIN, and the priority of the request is examined more carefully:
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
	//the watermark is lowered to MIN
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); //carry __GFP_HIGH over as ALLOC_HIGH

	if (gfp_mask & __GFP_ATOMIC) { //atomic allocation
		/*
		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
		 * if it can't schedule.
		 */
		if (!(gfp_mask & __GFP_NOMEMALLOC)) //unless __GFP_NOMEMALLOC was given
			alloc_flags |= ALLOC_HARDER; //try harder
		/*
		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
		 * comment for __cpuset_node_allowed().
		 */
		alloc_flags &= ~ALLOC_CPUSET; //ignore cpuset limits for atomic allocations rather than fail
	} else if (unlikely(rt_task(current)) && !in_interrupt()) //realtime tasks (outside interrupt context) also get ALLOC_HARDER
		alloc_flags |= ALLOC_HARDER;

#ifdef CONFIG_CMA
	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
#endif
	return alloc_flags;
}

* __GFP_HIGH indicates that the caller is high priority and the request must be granted for the system to make forward progress, e.g. creating an IO context to clean pages.
* __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is high priority; users are typically interrupt handlers. It may be combined with __GFP_HIGH.
* __GFP_MEMALLOC grants access to all of memory. It should only be used when the caller guarantees that the allocation lets it free more memory very shortly, e.g. process exit or swapping. Users should be the MM itself or code working closely with the VM (such as swap over NFS).
* __GFP_NOMEMALLOC explicitly forbids access to the emergency reserves. If both are set, it takes precedence over __GFP_MEMALLOC.

#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
GFP_ATOMIC: the caller cannot sleep and the allocation is expected to succeed; a lower watermark is applied so the emergency reserves may be used.
#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
GFP_KERNEL is the typical flag for kernel-internal allocations. The caller needs ZONE_NORMAL or a lower zone for direct access and may perform direct reclaim.
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except that the allocation is accounted to kmemcg.
#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
GFP_NOWAIT is for kernel allocations that must not stall for direct reclaim, start physical IO, or use any filesystem callback.
#define GFP_NOIO	(__GFP_RECLAIM)
GFP_NOIO may use direct reclaim to discard clean pages or slab pages, but starts no physical IO itself.
#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
GFP_NOFS may use direct reclaim but will not touch any filesystem interface.
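
A small sketch of picking gfp_mask by context (the function and the sizes are made up; kmalloc/kfree are the standard slab API):

#include <linux/slab.h>

static void gfp_examples(void)
{
	void *a, *b, *c;

	/* Process context: may sleep, may start I/O and use FS callbacks. */
	a = kmalloc(4096, GFP_KERNEL);

	/* Interrupt/atomic context: must not sleep; __GFP_HIGH|__GFP_ATOMIC
	 * allow dipping into the reserves, only kswapd reclaim is triggered. */
	b = kmalloc(128, GFP_ATOMIC);

	/* Block/writeback path: direct reclaim is fine, but no new physical I/O. */
	c = kmalloc(64, GFP_NOIO);

	kfree(a);
	kfree(b);
	kfree(c);
}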

Deciding whether reclaim should be retried
//did_some_progress: whether the previous reclaim round made any progress
//no_progress_loops: number of consecutive reclaim rounds without progress
//after more than 16 rounds (MAX_RECLAIM_RETRIES) without progress, give up
//if, for some zone at or below ac->high_zoneidx, reclaimable + free pages would still satisfy the MIN watermark, reclaim is worth retrying
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{
	struct zone *zone;
	struct zoneref *z;

	/*
	 * Costly allocations might have made a progress but this doesn't mean
	 * their order will become available due to high fragmentation so
	 * always increment the no progress counter for them
	 */
	 //if the last round made progress and the order is at most PAGE_ALLOC_COSTLY_ORDER (3), reset the no-progress counter
	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
		*no_progress_loops = 0;
	else //no progress last round, or a costly (order > 3) allocation
		(*no_progress_loops)++; //count another round without progress

	/*
	 * Make sure we converge to OOM if we cannot make any progress
	 * several times in the row.
	 */
	if (*no_progress_loops > MAX_RECLAIM_RETRIES) //16 rounds in a row without progress: stop retrying
		return false;

	/*
	 * Keep reclaiming pages while there is a chance this will lead
	 * somewhere.  If none of the target zones can satisfy our allocation
	 * request even if all reclaimable pages are considered then we are
	 * screwed and have to go OOM.
	 */
	 //iterate over every zone at or below ac->high_zoneidx
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
					ac->nodemask) {
		unsigned long available;
		unsigned long reclaimable;

		available = reclaimable = zone_reclaimable_pages(zone); //pages that could be reclaimed in this zone
		//discount (*no_progress_loops) * reclaimable / MAX_RECLAIM_RETRIES from the reclaimable estimate
		available -= DIV_ROUND_UP((*no_progress_loops) * available,
					  MAX_RECLAIM_RETRIES);
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES); //plus the pages that are already free

		/*
		 * Would the allocation succeed if we reclaimed the whole
		 * available?
		 */
		 //would 'available' satisfy the MIN watermark?
		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
				ac_classzone_idx(ac), alloc_flags, available)) {
			/*
			 * If we didn't make any progress and have a lot of
			 * dirty + writeback pages then we should wait for
			 * an IO to complete to slow down the reclaim and
			 * prevent from pre mature OOM
			 */
			if (!did_some_progress) { //no page was reclaimed last round: check whether too many writes are pending
				unsigned long write_pending;

				write_pending = zone_page_state_snapshot(zone,
							NR_ZONE_WRITE_PENDING);

				if (2 * write_pending > reclaimable) { //more than half of the reclaimable pages are waiting for writeback
					congestion_wait(BLK_RW_ASYNC, HZ/10); //wait briefly for writeback to catch up
					return true;
				}
			}

			/*
			 * Memory allocation/reclaim might be called from a WQ
			 * context and the current implementation of the WQ
			 * concurrency control doesn't recognize that
			 * a particular WQ is congested if the worker thread is
			 * looping without ever sleeping. Therefore we have to
			 * do a short sleep here rather than calling
			 * cond_resched().
			 */
			if (current->flags & PF_WQ_WORKER) //workqueue workers must sleep briefly
				schedule_timeout_uninterruptible(1);
			else //ordinary tasks just offer to reschedule
				cond_resched();

			return true;
		}
	}

	return false;
}
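
A made-up example of the discount applied to the reclaim estimate, with reclaimable = 8000 and no_progress_loops = 4 (MAX_RECLAIM_RETRIES = 16):

	available = 8000 - DIV_ROUND_UP(4 * 8000, 16) = 8000 - 2000 = 6000, plus the zone's free pages

Each further round without progress shrinks the estimate by another 1/16 of the reclaimable pages, until it can no longer clear the MIN watermark and the function returns false, heading towards the OOM killer.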