kswapd 学习一
kswapd 学习一
页面回收
当系统拥有大量空闲内存时,这些内存常被用作缓冲以提高系统性能;但是当遇到
伙伴系统无法满足的内存分配时,就需要回收这部分缓冲来满足系统的内存分配。
回收缓冲有2种方式
- kswapd线程异步回收
- 直接回收
先看kswapd异步回收
kswapd流程图
从图中可以看到kswapd是一个大的循环,如果zone_balanced,则进入睡眠;当无法通过快速路径从伙伴系统中分配到页面时,会唤醒kswapd来进行异步回收
判断kswapd是否可以睡眠
判断kswapd是否可以睡眠的条件主要是看该pgdat下的从0到classzone_idx的zone是否都是balance的。
kswapd_try_to_sleep
/*
 * kswapd_try_to_sleep - put the calling kswapd thread to sleep if the node
 * looks balanced.
 *
 * @pgdat:         node whose kswapd_wait queue the caller sleeps on
 * @alloc_order:   order handed to wakeup_kcompactd() before the short sleep
 * @reclaim_order: order handed to prepare_kswapd_sleep() for the balance check
 * @classzone_idx: highest zone index handed to prepare_kswapd_sleep()
 *
 * The sleep happens in two stages: first a short nap of HZ/10 jiffies, and
 * only if that nap was not interrupted and prepare_kswapd_sleep() still
 * succeeds does the thread call schedule() with no timeout. Returns
 * immediately without sleeping if the task is freezing or should stop.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
				unsigned int classzone_idx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);
	if (freezing(current) || kthread_should_stop())
		return;
	/* Queue ourselves on kswapd_wait and mark the task TASK_INTERRUPTIBLE */
	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	/* Try to sleep for a short interval */
	if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* are all eligible zones balanced? */
		/*
		 * Compaction records what page blocks it recently failed to
		 * isolate pages from and skips them in the future scanning.
		 * When kswapd is going to sleep, it is reasonable to assume
		 * that pages and compaction may succeed so reset the cache.
		 */
		reset_isolation_suitable(pgdat);
		/*
		 * We have freed the memory, now we should compact it to make
		 * allocation of the requested order possible.
		 */
		wakeup_kcompactd(pgdat, alloc_order, classzone_idx); /* wake kcompactd */
		remaining = schedule_timeout(HZ/10); /* sleep for at most HZ/10 jiffies (100 ms) */
		/*
		 * If woken prematurely then reset kswapd_classzone_idx and
		 * order. The values will either be from a wakeup request or
		 * the previous request that slept prematurely.
		 */
		if (remaining) { /* non-zero: woken before the 100 ms timeout expired */
			pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
			pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
		}
		/* Leave the wait queue (task back to running), then re-arm for the long sleep */
		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}
	/*
	 * After a short sleep, check if it was a premature sleep. If not, then
	 * go fully to sleep until explicitly woken up.
	 */
	if (!remaining &&
	prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
		/*
		 * Not woken during the short sleep and the zones are still
		 * balanced, so a full sleep is allowed.
		 */
		/*
		 * vmstat counters are not perfectly accurate and the estimated
		 * value for counters such as NR_FREE_PAGES can deviate from the
		 * true value by nr_online_cpus * threshold. To avoid the zone
		 * watermarks being breached while under pressure, we reduce the
		 * per-cpu vmstat threshold while kswapd is awake and restore
		 * them before going back to sleep.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
		if (!kthread_should_stop())
			schedule(); /* give up the CPU and wait to be woken */
		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else { /* otherwise it is not appropriate to keep sleeping */
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait); /* leave the wait queue; task is running again */
}
/*
 * prepare_kswapd_sleep - decide whether kswapd may go to sleep on this node.
 *
 * @pgdat:         node being examined
 * @order:         order passed through to zone_balanced()
 * @classzone_idx: highest zone index (inclusive) that is checked
 *
 * Returns true only when every managed zone with index 0..classzone_idx
 * passes zone_balanced(); the first unbalanced zone yields false.
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
	int zid = 0;

	/*
	 * Throttled tasks are normally released from balance_pgdat() once
	 * pfmemalloc_watermark_ok() holds, but that wakeup can be missed:
	 * a task may be throttled right after kswapd checked the
	 * watermarks, or a large process may exit and balance the zones so
	 * that kswapd leaves balance_pgdat() before its wakeup checks. When
	 * kswapd is about to sleep nobody should remain on pfmemalloc_wait,
	 * so release any waiters here. A premature wakeup is harmless: the
	 * tasks wake kswapd and get throttled again. Unlike the wakeups in
	 * balance_pgdat(), this one runs under prepare_to_wait().
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait))
		wake_up_all(&pgdat->pfmemalloc_wait);

	/* A single managed-but-unbalanced zone is enough to forbid sleeping. */
	while (zid <= classzone_idx) {
		struct zone *candidate = &pgdat->node_zones[zid++];

		if (managed_zone(candidate) &&
		    !zone_balanced(candidate, order, classzone_idx))
			return false;
	}

	return true;
}
判断一个zone是否是balance,就看该zone是否在high水位线以上
static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
{
unsigned long mark = high_wmark_pages(zone);
if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
return false;
/*
* If any eligible zone is balanced then the node is not considered
* to be congested or dirty
*/
clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
return true;
}
balance pgdat
kswapd回收页面的工作主要在balance_pgdat
先看一个页面扫描控制的结构,该结构控制kswapd的行为
/*
 * struct scan_control - parameters and running counters for one reclaim pass.
 * balance_pgdat() fills one in and hands it to the shrink helpers.
 */
struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;
	/* This context's GFP mask */
	gfp_t gfp_mask;
	/* Allocation order */
	int order;
	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;
	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;
	/*
	 * Scan (total_size >> priority) pages at once, so a lower priority
	 * value means a larger share of the LRU lists is scanned.
	 */
	int priority;
	/* The highest zone to isolate pages for reclaim from */
	enum zone_type reclaim_idx;
	/* May reclaim write pages back? */
	unsigned int may_writepage:1;
	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;
	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;
	/* Can cgroups be reclaimed below their normal consumption range? */
	unsigned int may_thrash:1;
	unsigned int hibernation_mode:1;
	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;
	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;
};
balance_pgdat的主要工作:
1.从classzone_idx到0遍历各个zone,检查每个zone是否均衡,如果有一个zone是均衡的
则该pgdat不需要做均衡,退出
2.老化匿名active链表,让页面有机会在回收之前被引用
3.判断是否需要回写页面,判断是依据是:
a. sc.priority < DEF_PRIORITY - 2 :意思是前几次扫描的页面数量少于期望回收的页面数量,没有扫描到足够多的页面
b. !pgdat_reclaimable(pgdat) : 扫描的数量 >= 可回收的六倍
需要回写页面:sc.may_writepage = 1;
4.kswapd_shrink_node 缩小当前节点最高zone及其以下的zone,使其平衡
如果扫描的页面数量 >= 期望回收的页面,说明扫描了足够的页面数量,不需要提高扫描力度raise_priority=false, 否则提高扫描力度raise_priority=true;
5. 判断是否需要唤醒pgdat->pfmemalloc_wait上的进程
6. 判断是否需要提高扫描力度 sc.priority--;
7. 检查priority是否大于等于1,如果是则回到步骤1,重新开始扫描
/*
 * balance_pgdat - reclaim pages from a node on behalf of kswapd.
 *
 * @pgdat:         node to reclaim from
 * @order:         allocation order stored in the scan control
 * @classzone_idx: highest zone index checked for balance
 *
 * Starting at DEF_PRIORITY, loops until one managed zone in
 * 0..classzone_idx passes zone_balanced(), the thread has to freeze or
 * stop, or sc.priority drops below 1. Each iteration ages the active anon
 * list, runs memcg soft-limit reclaim, then calls kswapd_shrink_node().
 *
 * Returns sc.order.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct zone *zone;
struct scan_control sc = { /* scan control for this reclaim run */
.gfp_mask = GFP_KERNEL,
.order = order, /* allocation order this reclaim is for */
.priority = DEF_PRIORITY, /* initial scan priority */
.may_writepage = !laptop_mode, /* writeback allowed unless in laptop mode */
.may_unmap = 1, /* mapped pages may be unmapped */
.may_swap = 1, /* pages may be swapped out */
};
count_vm_event(PAGEOUTRUN);
do { /* keep shrinking until one zone is balanced or priority drops below 1 */
bool raise_priority = true;
sc.nr_reclaimed = 0;
sc.reclaim_idx = classzone_idx;
/*
* If the number of buffer_heads exceeds the maximum allowed
* then consider reclaiming from all zones. This has a dual
* purpose -- on 64-bit systems it is expected that
* buffer_heads are stripped during active rotation. On 32-bit
* systems, highmem pages can pin lowmem memory and shrinking
* buffers can relieve lowmem pressure. Reclaim may still not
* go ahead if all eligible zones for the original allocation
* request are balanced to avoid excessive reclaim from kswapd.
*/
if (buffer_heads_over_limit) {
for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
sc.reclaim_idx = i;
break;
}
}
/*
* Only reclaim if there are no eligible zones. Check from
* high to low zone as allocations prefer higher zones.
* Scanning from low to high zone would allow congestion to be
* cleared during a very small window when a small low
* zone was balanced even under extreme pressure when the
* overall node may be congested. Note that sc.reclaim_idx
* is not used as buffer_heads_over_limit may have adjusted
* it.
*/
for (i = classzone_idx; i >= 0; i--) { /* walk zones from classzone_idx down to 0 */
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
if (zone_balanced(zone, sc.order, classzone_idx)) /* one balanced zone ends balance_pgdat */
goto out;
}
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
* pages are rotated regardless of classzone as this is
* about consistent aging.
*/
age_active_anon(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
/*
* Writeback is forced on once the priority has fallen below
* DEF_PRIORITY - 2 or pgdat_reclaimable() reports false.
*/
if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
sc.nr_scanned = 0;
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
* There should be no need to raise the scanning priority if
* enough pages are already being scanned that the high
* watermark would be met at 100% efficiency.
*/
/*
* kswapd_shrink_node() does the core reclaim work. A true return
* keeps the priority unchanged; presumably it signals that enough
* pages were scanned -- confirm against kswapd_shrink_node().
*/
if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should now be
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
/* Decrementing sc.priority makes the next pass scan more pages. */
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
out:
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
* entered the allocator slow path while kswapd was awake, order will
* remain at the higher level.
*/
return sc.order;
}
内核线程kswapd是个大循环检查node下的各个zone是否均衡,如果所有的zone都均衡,则kswapd睡眠,如果发现有一个zone不均衡,则执行balance_pgdat,直到所有zone都均衡了,然后进入睡眠。
balance_pgdat也是个大循环,balance_pgdat检查的是该node下的zone是否均衡,但是扫描的时候是扫描的整个node,这也可以避免zone老化速度不同,直到该node下有一个zone均衡为止