[scheduler] 10. How does traditional load balancing pick a suitable CPU for a task?

1 Overview

In earlier articles on how newly created tasks and idle tasks are scheduled after wake_up_process, one point was not analyzed clearly: how, during that scheduling flow, a CPU is chosen to run the scheduling entity. We now pull it out for a detailed analysis:

  • If the EAS feature is enabled, the flow runs select_energy_cpu_brute
  • If the EAS feature is not enabled, the traditional flow runs find_idlest_cpu

A simplified flow diagram:

(figure: CPU-selection flow of select_task_rq_fair)

This article analyzes only the second case, i.e. how the scheduler places a task when EAS is not enabled. The source code to be analyzed is as follows:

static int  
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, int sibling_count_hint)  
{  
    struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;  
    int cpu = smp_processor_id();  
    int new_cpu = prev_cpu;  
    int want_affine = 0;  
    int sync = wake_flags & WF_SYNC;  
  
#ifdef CONFIG_64BIT_ONLY_CPU  
    struct cpumask tmpmask;  
  
    if (find_packing_cpu(p, &new_cpu))  
        return new_cpu;  
  
    cpumask_andnot(&tmpmask, cpu_present_mask, &b64_only_cpu_mask);  
    if (cpumask_test_cpu(cpu, &tmpmask)) {  
        if (weighted_cpuload_32bit(cpu) >  
            sysctl_sched_32bit_load_threshold &&  
            !test_tsk_thread_flag(p, TIF_32BIT))  
            return min_load_64bit_only_cpu();  
    }  
#endif  
  
    if (sd_flag & SD_BALANCE_WAKE) {  
        record_wakee(p);  
        want_affine = !wake_wide(p, sibling_count_hint) &&  
                  !wake_cap(p, cpu, prev_cpu) &&  
                  cpumask_test_cpu(cpu, &p->cpus_allowed);  
    }  
  
    if (energy_aware())  
        return select_energy_cpu_brute(p, prev_cpu, sync);  
     /* What follows is the part this article analyzes. */
    rcu_read_lock();  
    for_each_domain(cpu, tmp) {  
        if (!(tmp->flags & SD_LOAD_BALANCE))  
            break;  
  
        /* 
         * If both cpu and prev_cpu are part of this domain, 
         * cpu is a valid SD_WAKE_AFFINE target. 
         */  
        if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&  
            cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {  
            affine_sd = tmp;  
            break;  
        }  
  
        if (tmp->flags & sd_flag)  
            sd = tmp;  
        else if (!want_affine)  
            break;  
    }  
  
    if (affine_sd) {  
        sd = NULL; /* Prefer wake_affine over balance flags */  
        if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))  
            new_cpu = cpu;  
    }  
  
    if (sd && !(sd_flag & SD_BALANCE_FORK)) {  
        /* 
         * We're going to need the task's util for capacity_spare_wake 
         * in find_idlest_group. Sync it up to prev_cpu's 
         * last_update_time. 
         */  
        sync_entity_load_avg(&p->se);  
    }  
  
    if (!sd) {  
        if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */  
            new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);  
  
    } else {  
        new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);  
    }  
    rcu_read_unlock();  
  
    return new_cpu;  
}  

The remaining code above is analyzed in the following three parts:

  1. Select a sched domain via the want_affine variable and determine new_cpu
  2. Select an idle sibling CPU based on the sched domain and its parameters
  3. Select the idlest CPU within the sched domain

2 Selecting a sched domain via want_affine and determining new_cpu

We know the following facts:

  • sd_flag for task p's wakeup has SD_BALANCE_WAKE set
  • the current CPU's wakee fan-out has not exceeded the limit (wake_wide returns 0)
  • task p's consumed capacity * 1138 is smaller than min_cap * 1024 (wake_cap returns 0)
  • the current CPU is one of the CPUs in task p's affinity mask

Only when all of the above conditions hold is want_affine set to 1 (a reference sketch of the wake_cap test follows).
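For reference, wake_cap in kernels of this generation looks roughly like the sketch below; treat the exact margin constant as an assumption, since vendors tune it (the article's 1138 would be this kernel's capacity_margin):

static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
	long min_cap, max_cap;

	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;

	/* Minimum capacity is close to max, no need to abort wake_affine */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Bring task utilization in sync with prev_cpu */
	sync_entity_load_avg(&p->se);

	/* the affine fast path is rejected when p does not fit min_cap */
	return min_cap * 1024 < task_util(p) * capacity_margin;
}
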
Now let's analyze this part of the code:

for_each_domain(cpu, tmp) {
         /* This if never triggers: SD_LOAD_BALANCE is already set when the
         sched domain is initialized in sd_init, along with the balance
         flags for the various task states */
        if (!(tmp->flags & SD_LOAD_BALANCE))  
            break;  
  
        /* 
         * If both cpu and prev_cpu are part of this domain, 
         * cpu is a valid SD_WAKE_AFFINE target. 
         */  
        if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&  
            cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {  
            affine_sd = tmp;  
            break;  
        }  
  
        if (tmp->flags & sd_flag)  
            sd = tmp;  
        else if (!want_affine)  
            break;  
    }  

The flags set up at sd_init time are:

.flags          = 1*SD_LOAD_BALANCE  
            | 1*SD_BALANCE_NEWIDLE  
            | 1*SD_BALANCE_EXEC  
            | 1*SD_BALANCE_FORK  
            | 0*SD_BALANCE_WAKE  
            | 1*SD_WAKE_AFFINE  
            | 0*SD_SHARE_CPUCAPACITY  
            | 0*SD_SHARE_PKG_RESOURCES  
            | 0*SD_SERIALIZE  
            | 0*SD_PREFER_SIBLING  
            | 0*SD_NUMA  
#ifdef CONFIG_INTEL_DWS  
            | 0*SD_INTEL_DWS  
#endif  
            | sd_flags  
            ,  
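
These per-level flags are combined with the sd_flags supplied by the architecture's topology table. As a reference sketch (the mainline default_topology of this kernel generation; the exact levels depend on the SoC, and a typical ARM big.LITTLE part has only MC and DIE):

static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};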

2.1 Updating new_cpu based on affine_sd

The for_each_domain(cpu, tmp) walk is annotated below:

/* walk the domains from the lowest level (MC) up to the top level (DIE) */
for_each_domain(cpu, tmp) {
         /* This if never triggers: SD_LOAD_BALANCE is already set when the
         sched domain is initialized in sd_init, along with the balance
         flags for the various task states */
        if (!(tmp->flags & SD_LOAD_BALANCE))  
            break;  
  
        /* 
         * If both cpu and prev_cpu are part of this domain, 
         * cpu is a valid SD_WAKE_AFFINE target. 
         */  
        /* sched_domain_span(tmp) is the set of CPUs covered by the domain
        being visited: at the lowest SDTL level (e.g. MC on ARM) a domain is
        one cluster, while the top level (DIE) spans all CPUs. If
        SD_WAKE_AFFINE is set and the condition holds, the current domain
        is recorded in affine_sd.
         */
        
        if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&  
            cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {  
            affine_sd = tmp;  
            break;  
        }  
        /* the domain's balance flags match the sd_flag passed to select_task_rq */
        if (tmp->flags & sd_flag)  
            sd = tmp;  
        else if (!want_affine)  
            break;  
    }

We know that when a task is woken up from idle, sd_flag is set to SD_BALANCE_WAKE; but sd_init does not set this flag, so the case sd = NULL and affine_sd = NULL can occur. Let's continue with the next block:

/* if affine_sd is not NULL, update new_cpu according to the conditions below */
if (affine_sd) {  
    sd = NULL; /* Prefer wake_affine over balance flags */  
    if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))  
        new_cpu = cpu;  
}  

Before digging into the wake_affine source, we need to know how three kinds of load are computed:

  1. source_load(int cpu, int type)
  2. target_load(int cpu, int type)
  3. effective_load(struct task_group *tg, int cpu, long wl, long wg)

Each is analyzed in turn.

2.1.1 source_load

Its source:

/* 
 * Return a low guess at the load of a migration-source cpu weighted 
 * according to the scheduling class and "nice" value. 
 * 
 * We want to under-estimate the load of migration sources, to 
 * balance conservatively. 
 */  
static unsigned long source_load(int cpu, int type)  
{  
    struct rq *rq = cpu_rq(cpu);
    /* load of the cfs_rq on this cpu */  
    unsigned long total = weighted_cpuload(cpu);  
    /* type = sd->wake_idx defaults to 0 and SCHED_FEAT(LB_BIAS, true), so this condition normally holds */
    if (type == 0 || !sched_feat(LB_BIAS))  
        return total;  
  
    return min(rq->cpu_load[type-1], total);  
}  
  
/* Used instead of source_load when we know the type == 0 */  
static unsigned long weighted_cpuload(const int cpu)  
{  /* load of this cpu's cfs_rq; the cpu's total load would be the sum of its cfs_rq and rt_rq loads */
    return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);  
}  
  
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)  
{  /* the cfs_rq's runnable load average, refreshed whenever entity load is updated */
    return cfs_rq->runnable_load_avg;  
}  

2.1.2 target_load

/* 
 * Return a high guess at the load of a migration-target cpu weighted 
 * according to the scheduling class and "nice" value. 
 */  
static unsigned long target_load(int cpu, int type)  
{  
    struct rq *rq = cpu_rq(cpu);  
    unsigned long total = weighted_cpuload(cpu);  
  
    if (type == 0 || !sched_feat(LB_BIAS))  
        return total;  
  
    return max(rq->cpu_load[type-1], total);  
}  

This function mirrors source_load: when type is non-zero, source_load takes the smaller of the historical and instantaneous values, while target_load takes the larger.
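
A minimal sketch of the LB_BIAS effect, assuming hypothetical loads (rq->cpu_load[type-1] = 800, weighted_cpuload = 600):

#include <stdio.h>

/* min/max bias applied when type != 0, mirroring source_load/target_load */
static unsigned long src(unsigned long hist, unsigned long now)
{
	return now < hist ? now : hist;   /* under-estimate the source */
}

static unsigned long tgt(unsigned long hist, unsigned long now)
{
	return now > hist ? now : hist;   /* over-estimate the target */
}

int main(void)
{
	printf("source_load -> %lu\n", src(800, 600));  /* 600 */
	printf("target_load -> %lu\n", tgt(800, 600));  /* 800 */
	return 0;
}

Biasing the source low and the target high makes migration look less attractive in both directions, which is exactly the conservative balancing the kernel comments describe.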

2.1.3 effective_load

Source:

#ifdef CONFIG_FAIR_GROUP_SCHED  
/* 
 * effective_load() calculates the load change as seen from the root_task_group 
 * 
 * Adding load to a group doesn't make a group heavier, but can cause movement 
 * of group shares between cpus. Assuming the shares were perfectly aligned one 
 * can calculate the shift in shares. 
 * 
 * Calculate the effective load difference if @wl is added (subtracted) to @tg 
 * on this @cpu and results in a total addition (subtraction) of @wg to the 
 * total group weight. 
 * 
 * Given a runqueue weight distribution (rw_i) we can compute a shares 
 * distribution (s_i) using: 
 * 
 *   s_i = rw_i / \Sum rw_j                     (1) 
 * 
 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and 
 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting 
 * shares distribution (s_i): 
 * 
 *   rw_i = {   2,   4,   1,   0 } 
 *   s_i  = { 2/7, 4/7, 1/7,   0 } 
 * 
 * As per wake_affine() we're interested in the load of two CPUs (the CPU the 
 * task used to run on and the CPU the waker is running on), we need to 
 * compute the effect of waking a task on either CPU and, in case of a sync 
 * wakeup, compute the effect of the current task going to sleep. 
 * 
 * So for a change of @wl to the local @cpu with an overall group weight change 
 * of @wl we can compute the new shares distribution (s'_i) using: 
 * 
 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                (2) 
 * 
 * Suppose we're interested in CPUs 0 and 1, and want to compute the load 
 * differences in waking a task to CPU 0. The additional task changes the 
 * weight and shares distributions like: 
 * 
 *   rw'_i = {   3,   4,   1,   0 } 
 *   s'_i  = { 3/8, 4/8, 1/8,   0 } 
 * 
 * We can then compute the difference in effective weight by using: 
 * 
 *   dw_i = S * (s'_i - s_i)                        (3) 
 * 
 * Where 'S' is the group weight as seen by its parent. 
 * 
 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) 
 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - 
 * 4/7) times the weight of the group. 
 */  
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  
{  
    struct sched_entity *se = tg->se[cpu];  
  
    if (!tg->parent) /* the trivial, non-cgroup case */  
        return wl;  
  
    for_each_sched_entity(se) {  
        struct cfs_rq *cfs_rq = se->my_q;  
        long W, w = cfs_rq_load_avg(cfs_rq);  
  
        tg = cfs_rq->tg;  
  
        /* 
         * W = @wg + \Sum rw_j 
         */  
        W = wg + atomic_long_read(&tg->load_avg);  
  
        /* Ensure \Sum rw_j >= rw_i */  
        W -= cfs_rq->tg_load_avg_contrib;  
        W += w;  
  
        /* 
         * w = rw_i + @wl 
         */  
        w += wl;  
  
        /* 
         * wl = S * s'_i; see (2) 
         */
         /* tg->shares may be changed via sched_group_set_shares, but
          root_task_group.shares = ROOT_TASK_GROUP_LOAD is set when the
          scheduler initializes in sched_init, i.e. the nice-0 weight 1024 */   
        if (W > 0 && w < W)  
            wl = (w * (long)tg->shares) / W;  
        else  
            wl = tg->shares;  
  
        /* 
         * Per the above, wl is the new se->load.weight value; since 
         * those are clipped to [MIN_SHARES, ...) do so now. See 
         * calc_cfs_shares(). 
         */  
        if (wl < MIN_SHARES)  
            wl = MIN_SHARES;  
  
        /* 
         * wl = dw_i = S * (s'_i - s_i); see (3) 
         */  
        wl -= se->avg.load_avg;  
  
        /* 
         * Recursively apply this logic to all parent groups to compute 
         * the final effective load change on the root group. Since 
         * only the @tg group gets extra weight, all parent groups can 
         * only redistribute existing shares. @wl is the shift in shares 
         * resulting from this level per the above. 
         */  
        wg = 0;  
    }  
  
    return wl;  
}  
#else  
  
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  
{  
    return wl;  
}  
  
#endif  

The comment block above the function already explains the computation well. Summarized:

  1. Take the scheduling entity's current load average: se->avg.load_avg
  2. Recompute the entity's share as if the task's load were added:
    big W = group load + this cfs_rq's load + the new task's load - the cfs_rq's previously recorded contribution to the group load = @wg + \Sum rw_j
    small w = this cfs_rq's load + the new task's load = rw_i + @wl
  3. Return the difference: the step-2 value minus the step-1 value

for_each_sched_entity walks from the lowest-level se all the way up to its root, root_task_group. To make equations (1)-(3) concrete, a standalone walk-through of the comment block's own numbers follows.
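
A minimal sketch reproducing the example in the comment block (4 CPUs, 7 equal-weight tasks, one extra task woken on CPU 0; S is the group weight seen by the parent, here assumed to be 1024):

#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };    /* per-cpu runqueue weights rw_i   */
	double S = 1024;                  /* group weight seen by its parent */
	double wl = 1, wg = 1;            /* one extra task woken on CPU 0   */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];          /* \Sum rw_j    */

	double s0  = rw[0] / sum;               /* (1): s_0  = 2/7           */
	double s0n = (rw[0] + wl) / (wg + sum); /* (2): s'_0 = 3/8           */
	printf("dw_0 = %.2f (= S * 5/56)\n", S * (s0n - s0));    /* (3)      */

	/* CPU 1 only sees the denominator grow: 4/8 - 4/7 = -4/56 */
	double s1 = rw[1] / sum, s1n = rw[1] / (wg + sum);
	printf("dw_1 = %.2f (= S * -4/56)\n", S * (s1n - s1));
	return 0;
}

Running it prints dw_0 ≈ 91.43 and dw_1 ≈ -73.14, matching the 5/56 and -4/56 fractions derived in the comment.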

2.1.4 Core function wake_affine

Annotated source:

static int wake_affine(struct sched_domain *sd, struct task_struct *p,  
               int prev_cpu, int sync)  
{  
    s64 this_load, load;  
    s64 this_eff_load, prev_eff_load;  
    int idx, this_cpu;  
    struct task_group *tg;  
    unsigned long weight;  
    int balanced;  
    /* sched_domain's wake_idx is an entry in the table allocated by
     sd_alloc_ctl_domain_table. It defaults to 0, set in sd_init, and can
     be changed through the sched sysctl interface */
    idx   = sd->wake_idx;     
    this_cpu  = smp_processor_id();/* id of the cpu running this function */  
    load      = source_load(prev_cpu, idx);  
    this_load = target_load(this_cpu, idx);  
  
    /* 
     * If sync wakeup then subtract the (maximum possible) 
     * effect of the currently running task from the load 
     * of the current CPU: 
     */  /* sync = wake_flags & WF_SYNC = 0 & 0x01 = 0; so far every caller
     observed passes wake_flags = 0 */
    if (sync) {  
        tg = task_group(current);  
        weight = current->se.avg.load_avg;  
  
        this_load += effective_load(tg, this_cpu, -weight, -weight);  
        load += effective_load(tg, prev_cpu, 0, -weight);  
    }  
  
    tg = task_group(p);  
    weight = p->se.avg.load_avg;  
  
    /* 
     * In low-load situations, where prev_cpu is idle and this_cpu is idle 
     * due to the sync cause above having dropped this_load to 0, we'll 
     * always have an imbalance, but there's really nothing you can do 
     * about that, so that's good too. 
     * 
     * Otherwise check if either cpus are near enough in load to allow this 
     * task to be woken on this_cpu. 
     *//* what follows weighs the load impact of placing task p on this
      cpu versus leaving it on prev_cpu */  
    this_eff_load = 100;  
    this_eff_load *= capacity_of(prev_cpu);  
  
    prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;  
    prev_eff_load *= capacity_of(this_cpu);  
  
    if (this_load > 0) {  
        this_eff_load *= this_load +  
            effective_load(tg, this_cpu, weight, weight);  
  
        prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);  
    }  
    /* Decide whether to migrate the task. Why is effective_load's third
    argument 0 when computing prev_eff_load? Because prev_cpu is where task p
    is already running; the third argument carries a value only when the task
    would be moved onto that cpu. */
    balanced = this_eff_load <= prev_eff_load;  
  
    schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);  
  
    if (!balanced)  
        return 0;  
  
    schedstat_inc(sd, ttwu_move_affine);  
    schedstat_inc(p, se.statistics.nr_wakeups_affine);  
  
    return 1;  
} 

Only when the current CPU differs from prev_cpu and wake_affine returns 1 does new_cpu become the current CPU, i.e. task p may then be migrated there. Before moving on, here is a numeric check of the balanced test, with hypothetical values (imbalance_pct = 117, equal CPU capacities, sync = 0):
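
#include <stdio.h>

int main(void)
{
	long cap_prev = 1024, cap_this = 1024;  /* capacity_of(prev/this)   */
	long imbalance_pct = 117;               /* hypothetical sd value    */
	long this_load = 300, load = 500;       /* waking / previous cpu    */
	long eff = 200;                         /* effective_load of task p */

	long this_eff = 100 * cap_prev * (this_load + eff);      /* 51200000 */
	long prev_eff = (100 + (imbalance_pct - 100) / 2)
			* cap_this * (load + eff);               /* 77414400 */

	/* balanced: even with p added, the waking cpu is no worse off */
	printf("balanced = %d\n", this_eff <= prev_eff);         /* 1 */
	return 0;
}

Since this_eff <= prev_eff, the wakeup counts as balanced and wake_affine returns 1, pulling the task toward the waking CPU. We now continue with the next snippet: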

/* taken when sd is non-NULL and sd_flag is not SD_BALANCE_FORK */
if (sd && !(sd_flag & SD_BALANCE_FORK)) {  
    /* 
     * We're going to need the task's util for capacity_spare_wake 
     * in find_idlest_group. Sync it up to prev_cpu's 
     * last_update_time. 
     * This updates task p's entity load average (PELT).
    */  
    sync_entity_load_avg(&p->se);  
}  

2.2 Core function select_idle_sibling

We know that whenever affine_sd != NULL, sd is reset to NULL, so the path that chooses new_cpu goes through select_idle_sibling:

 if (!sd) {
        /* if affine_sd was non-NULL, sd is NULL here; SD_BALANCE_WAKE is set */
        if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */  
            new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);  
/* 
 * Try and locate an idle CPU in the sched_domain. 
 */  
static int select_idle_sibling(struct task_struct *p, int prev, int target)  
{  
    struct sched_domain *sd;  
    struct sched_group *sg;  
    int best_idle_cpu = -1;  
    int best_idle_cstate = INT_MAX;  
    unsigned long best_idle_capacity = ULONG_MAX;  
  
    schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);  
    schedstat_inc(this_rq(), eas_stats.sis_attempts);  
    /* is the cstate-aware control knob set, i.e. should decisions depend on
    how deep each cpu's idle state is? */
    if (!sysctl_sched_cstate_aware) {
    /* if not set, just check whether the target cpu is idle, ignoring any
    shallower-idle cpus elsewhere; if it is idle, return it directly. */  
        if (idle_cpu(target)) {  
            schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);  
            schedstat_inc(this_rq(), eas_stats.sis_idle);  
            return target;  
        }  
        /* Intel platform */
#ifdef CONFIG_INTEL_DWS  
        if (sched_feat(INTEL_DWS)) {  
            /* 
             * For either waker or wakee CPU, if it is idle, then select it, but 
             * if not, we lower down the bar to use a threshold of runnable avg 
             * to determine whether it is capable of handling the wakee task 
             */  
            if (sysctl_sched_wakeup_threshold && cpu_more_runnable(target))  
                return target;  
  
            if (prev != target) {  
                /* 
                 * If the prevous cpu is cache affine and idle, don't be stupid. 
                 */  
                if (cpus_share_cache(prev, target) && idle_cpu(prev))  
                    return prev;  
                if (sysctl_sched_wakeup_threshold && cpu_more_runnable(prev))  
                    return prev;  
            }  
        } else  
#endif  
        /* 
         * If the prevous cpu is cache affine and idle, don't be stupid. 
         */   /* if the cpu task p last ran on (prev) differs from target,
        prev shares cache with target, and prev is idle, pick prev */  
        if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {  
            schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);  
            schedstat_inc(this_rq(), eas_stats.sis_cache_affine);  
            return prev;  
        }  
    }  
  
    /* 
     * Otherwise, iterate the domains and find an elegible idle cpu. 
     */  
    sd = rcu_dereference(per_cpu(sd_llc, target)); 
     /* below: look for the smallest-capacity cpu in the shallowest idle state as the placement target for task p */ 
    for_each_lower_domain(sd) {  
        sg = sd->groups;  
        do {  
            int i;
             /* if task p's affinity mask does not intersect this group's span, move straight on to the next sg */  
            if (!cpumask_intersects(sched_group_cpus(sg),  
                        tsk_cpus_allowed(p)))  
                goto next;  
  
            if (sysctl_sched_cstate_aware) {  
                for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {   /* idle-state index of this cpu */
                    int idle_idx = idle_get_state_idx(cpu_rq(i));  
                    unsigned long new_usage = boosted_task_util(p);  
                    unsigned long capacity_orig = capacity_orig_of(i);  
  
                    if (new_usage > capacity_orig || !idle_cpu(i))  
                        goto next;  
                     /* if the iterated cpu is the target and task p's util
                     fits the target's current capacity, return target */
                    if (i == target && new_usage <= capacity_curr_of(target)) {  
                        schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);  
                        schedstat_inc(this_rq(), eas_stats.sis_suff_cap);  
                        schedstat_inc(sd, eas_stats.sis_suff_cap);  
                        return target;  
                    }  
                    /* track the cpu in the shallowest idle state with the smallest capacity */
                    if (idle_idx < best_idle_cstate &&  
                        capacity_orig <= best_idle_capacity) {  
                        best_idle_cpu = i;   /* remember this candidate */
                        best_idle_cstate = idle_idx;  
                        best_idle_capacity = capacity_orig;  
                    }  
                }  
            } else {   /* without the cstate-aware knob, a group whose cpus
                  are all idle is good enough */
                for_each_cpu(i, sched_group_cpus(sg)) {  
                    if (i == target || !idle_cpu(i))  
                        goto next;  
                }  
                 /* reached only when every cpu in the group is idle (and none is target): pick the first cpu allowed for p */
                target = cpumask_first_and(sched_group_cpus(sg),  
                    tsk_cpus_allowed(p));  
                schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);  
                schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);  
                schedstat_inc(sd, eas_stats.sis_idle_cpu);  
                goto done;  
            }  
next:  
            sg = sg->next;  
        } while (sg != sd->groups);  
    }  
  
    if (best_idle_cpu >= 0)  
        target = best_idle_cpu;  
  
done:  
    schedstat_inc(p, se.statistics.nr_wakeups_sis_count);  
    schedstat_inc(this_rq(), eas_stats.sis_count);  
  
    return target;  
}  

The idea is fairly simple: prefer an idle CPU in a strictly shallower C-state, with the capacity check biasing toward smaller CPUs. The standalone model below distills that selection loop.
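
A minimal model of the cstate-aware scan, with hypothetical idle-state indices and capacities (-1 marks a busy cpu):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	int idle_idx[4]      = { -1, 2, 1, 1 };           /* -1 = busy     */
	unsigned long cap[4] = { 1024, 1024, 1024, 512 }; /* capacity_orig */
	int best_cpu = -1, best_cstate = INT_MAX;
	unsigned long best_cap = (unsigned long)-1;

	for (int i = 0; i < 4; i++) {
		if (idle_idx[i] < 0)                     /* !idle_cpu(i)  */
			continue;
		/* same compound test as the kernel loop above */
		if (idle_idx[i] < best_cstate && cap[i] <= best_cap) {
			best_cpu = i;
			best_cstate = idle_idx[i];
			best_cap = cap[i];
		}
	}
	printf("best_idle_cpu = %d\n", best_cpu);  /* 2 */
	return 0;
}

Note the quirk this makes visible: cpu 3 is equally shallow and smaller, but because the kernel test requires a strictly shallower idle state, cpu 2, found first, is kept.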

3 Core function find_idlest_cpu

Analyzing this function requires understanding two helper functions:

  1. find_idlest_group(sd, p, cpu, sd_flag)
  2. find_idlest_group_cpu(group, p, cpu)

They are applied in sequence: once the idlest group is found, the idlest CPU inside that group is located, which finally fixes the target CPU id. Each is analyzed below.

3.1 Helper function find_idlest_group

Since this article targets an ARM platform, we follow the variant that ARM calls:

/* 
 * find_idlest_group finds and returns the least busy CPU group within the 
 * domain. 
 * 
 * Assumes p is allowed on at least one CPU in sd. 
 */  
  
 
static struct sched_group *  
find_idlest_group(struct sched_domain *sd, struct task_struct *p,  
          int this_cpu, int sd_flag)  
{  
    struct sched_group *idlest = NULL, *group = sd->groups;  
    struct sched_group *most_spare_sg = NULL;  
    unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;  
    unsigned long most_spare = 0, this_spare = 0;  
    int load_idx = sd->forkexec_idx;  
    int imbalance = 100 + (sd->imbalance_pct-100)/2;  
    /* choose load_idx from sd_flag (only woken tasks switch to wake_idx) */
    if (sd_flag & SD_BALANCE_WAKE)  
        load_idx = sd->wake_idx;  
    /* iterate over every sched group in sd */
    do {  
        unsigned long load, avg_load, spare_cap, max_spare_cap;  
        int local_group;  
        int i;  
        /* Skip over this group if it has no CPUs allowed, i.e. the group's
        span does not intersect p's affinity mask */  
        if (!cpumask_intersects(sched_group_cpus(group),  
                    tsk_cpus_allowed(p)))  
            continue;  
        /* when sd is the lowest SDTL level (MC), each group spans a single
        cpu, so cpumask_weight(sched_group_cpus(group)) == 1; local_group
        tells whether this_cpu is in the group being visited */
        local_group = cpumask_test_cpu(this_cpu,  
                           sched_group_cpus(group));  
  
        /* 
         * Tally up the load of all CPUs in the group and find 
         * the group containing the CPU with most spare capacity. 
         */  
        avg_load = 0;  
        max_spare_cap = 0;  
        /* accumulate this group's load and find its largest unused capacity */
        for_each_cpu(i, sched_group_cpus(group)) {  
            /* Bias balancing toward cpus of our domain */  
            if (local_group)  
                load = source_load(i, load_idx);  
            else  
                load = target_load(i, load_idx);  
             /* accumulate load; normalized to a relative load below */
            avg_load += load;  
            /* cpu i's spare capacity headroom, i.e. capacity_of(i) minus its utilization */
            spare_cap = capacity_spare_wake(i, p);  
             /* track the maximum spare capacity seen in this loop */
            if (spare_cap > max_spare_cap)  
                max_spare_cap = spare_cap;  
        }  
        /* normalize against the group's relative CPU capacity */
        /* Adjust by relative CPU capacity of the group */  
        avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;  
         /* update the decision variables depending on whether this group is the local one */
        if (local_group) {  
            this_load = avg_load;  
            this_spare = max_spare_cap;  
        } else { 
            /* remember the group with the smallest load */ 
            if (avg_load < min_load) {  
                min_load = avg_load;  
                idlest = group;  
            }  
             /* remember the group with the largest spare capacity */
            if (most_spare < max_spare_cap) {  
                most_spare = max_spare_cap;  
                most_spare_sg = group;  
            }  
        }  
    } while (group = group->next, group != sd->groups);  
  
    /* 
     * The cross-over point between using spare capacity or least load 
     * is too conservative for high utilization tasks on partially 
     * utilized systems if we require spare_capacity > task_util(p), 
     * so we allow for some task stuffing by using 
     * spare_capacity > task_util(p)/2. 
     * 
     * Spare capacity can't be used for fork because the utilization has 
     * not been set yet, we must first select a rq to compute the initial 
     * utilization. 
     */  /* the policy below decides which criterion defines the idlest group: */
     /* 1. for a newly forked task, skip spare capacity and use the
        least-loaded group (its utilization is not set yet)
       2. if the local group's spare capacity exceeds half of p's util and
        beats most_spare by the imbalance margin, return NULL (stay local)
       3. else if the largest spare capacity exceeds half of p's util,
        return the group with the most spare capacity
       4. otherwise pick the idlest (least-loaded) group */ 
    if (sd_flag & SD_BALANCE_FORK)  
        goto skip_spare;  
  
    if (this_spare > task_util(p) / 2 &&  
        imbalance*this_spare > 100*most_spare)  
        return NULL;  
    else if (most_spare > task_util(p) / 2)  
        return most_spare_sg;  
  
skip_spare:  
    if (!idlest || 100*this_load < imbalance*min_load)  
        return NULL;  
    return idlest;  
}  

Fairly easy to follow.
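
A minimal sketch of the tail policy with hypothetical numbers (imbalance_pct = 117, so imbalance = 108; task_util(p) = 200):

#include <stdio.h>

int main(void)
{
	long imbalance = 100 + (117 - 100) / 2;   /* 108                   */
	long task_util = 200;
	long this_spare = 150, most_spare = 90;   /* hypothetical headroom */

	if (this_spare > task_util / 2 &&
	    imbalance * this_spare > 100 * most_spare)
		/* 150 > 100 and 16200 > 9000: the local group has enough
		 * headroom, so returning NULL keeps the task near this_cpu */
		printf("return NULL\n");
	else if (most_spare > task_util / 2)
		printf("return most_spare_sg\n");
	else
		printf("fall through to the least-loaded group\n");
	return 0;
}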

3.2 Helper function find_idlest_group_cpu

Having chosen the idlest group, we now pick the idlest CPU inside it.

/* 
 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 
 */  
static int  
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)  
{  
    unsigned long load, min_load = ULONG_MAX;  
    unsigned int min_exit_latency = UINT_MAX;  
    u64 latest_idle_timestamp = 0;  
    int least_loaded_cpu = this_cpu;  
    int shallowest_idle_cpu = -1;  
    int i;  
  
    /* Check if we have any choice: */  
    if (group->group_weight == 1)  
        return cpumask_first(sched_group_cpus(group));  
  
    /* Traverse only the allowed CPUs */  
    for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {  
        if (idle_cpu(i)) {  
            struct rq *rq = cpu_rq(i);  
            struct cpuidle_state *idle = idle_get_state(rq);
            /* an idle cpu with the smallest exit latency is in the
            shallowest idle state: the deeper the state, the larger the
            exit latency */  
            if (idle && idle->exit_latency < min_exit_latency) {  
                /* 
                 * We give priority to a CPU whose idle state 
                 * has the smallest exit latency irrespective 
                 * of any idle timestamp. 
                 */  
                min_exit_latency = idle->exit_latency;  
                latest_idle_timestamp = rq->idle_stamp;  
                shallowest_idle_cpu = i;
             /* if the exit latency ties the minimum (or there is no active
             idle state) and this cpu went idle more recently, prefer it:
             as the kernel comment says, it may have a warmer cache */  
            } else if ((!idle || idle->exit_latency == min_exit_latency) &&  
                   rq->idle_stamp > latest_idle_timestamp) {  
                /* 
                 * If equal or no active idle state, then 
                 * the most recently idled CPU might have 
                 * a warmer cache. 
                 */  
                latest_idle_timestamp = rq->idle_stamp;  
                shallowest_idle_cpu = i;  
            }  
         /* if no cpu is idle, fall back to the least-loaded cpu */
        } else if (shallowest_idle_cpu == -1) {  
            load = weighted_cpuload(i);  
            if (load < min_load || (load == min_load && i == this_cpu)) {  
                min_load = load;  
                least_loaded_cpu = i;  
            }  
        }  
    }  
    /* return the shallowest-idle cpu if any cpu was idle, otherwise the least-loaded cpu */
    return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;  
}  

Again the idea is simple; a small model of the selection priority follows.
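
A minimal model of the priority order (smallest idle exit latency first, then most recent idle timestamp as tie-break; all numbers hypothetical, -1 = busy):

#include <stdio.h>

int main(void)
{
	int lat[3]         = { -1, 400, 100 };   /* exit latency in us */
	long long stamp[3] = { 0, 900, 950 };    /* rq->idle_stamp     */
	int best = -1, min_lat = 1 << 30;
	long long latest = 0;

	for (int i = 0; i < 3; i++) {
		if (lat[i] < 0)
			continue;                /* skip busy cpus        */
		if (lat[i] < min_lat) {          /* shallower state wins  */
			min_lat = lat[i];
			latest = stamp[i];
			best = i;
		} else if (lat[i] == min_lat && stamp[i] > latest) {
			latest = stamp[i];       /* warmer-cache tie-break */
			best = i;
		}
	}
	printf("shallowest_idle_cpu = %d\n", best);  /* 2 */
	return 0;
}
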
With both helpers covered, we turn to the main function:

3.3 Core function find_idlest_cpu

static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag)  
{  
    int new_cpu = cpu;  
    int wu = sd_flag & SD_BALANCE_WAKE;  
    int cas_cpu = -1;  
  
    if (wu) {  
        schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);  
        schedstat_inc(this_rq(), eas_stats.cas_attempts);  
    }  
    /* does p's affinity mask intersect the domain's span at all? */
    if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))  
        return prev_cpu;  
    /* within this domain, find the idlest group, then the idlest cpu */
    while (sd) {  
        struct sched_group *group = NULL;  
        struct sched_domain *tmp;  
        int weight;  
  
        if (wu)  
            schedstat_inc(sd, eas_stats.cas_attempts);  
  
        if (!(sd->flags & sd_flag)) {  
            sd = sd->child;  
            continue;  
        }  
  
#ifdef CONFIG_INTEL_DWS  
            if (sched_feat(INTEL_DWS)) {  
                if (sd->flags & SD_INTEL_DWS)  
                    group = dws_find_group(sd, p, cpu);  
                if (!group)  
                    group = find_idlest_group(sd, p, cpu, sd_flag);  
            } else  
#endif  /* find the group with the least load / most spare capacity */
        group = find_idlest_group(sd, p, cpu, sd_flag);  
        if (!group) {  
            sd = sd->child;  
            continue;  
        }  
        /* find the shallowest-idle / least-loaded cpu in that group */
        new_cpu = find_idlest_group_cpu(group, p, cpu);  
        if (new_cpu == cpu) {  
            /* Now try balancing at a lower domain level of cpu */  
            sd = sd->child;  
            continue;  
        }  
  
        /* Now try balancing at a lower domain level of new_cpu */  
        cpu = cas_cpu = new_cpu;  
        weight = sd->span_weight;  
        sd = NULL;  /* loop-termination condition */
        for_each_domain(cpu, tmp) {
            /* span_weight differs across levels: with two SDTLs
            (MC = child, DIE = parent) on an 8-core, two-cluster part,
            DIE's weight (8) exceeds MC's (4). This loop restarts the walk
            from new_cpu's domains, continuing only below the level just
            balanced at, since the sd passed in may be either MC or DIE */
            if (weight <= tmp->span_weight)  
                break;  
            if (tmp->flags & sd_flag)  
                sd = tmp;  
        }  
        /* while loop will break here if sd == NULL */  
    }  
  
    if (wu && (cas_cpu >= 0)) {  
        schedstat_inc(p, se.statistics.nr_wakeups_cas_count);  
        schedstat_inc(this_rq(), eas_stats.cas_count);  
    }  
    
    return new_cpu;  
}  

Again the logic is straightforward.
The key is still understanding the relationship between sched domains and sched groups; the sketch below illustrates it for a hypothetical part.
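
/*
 * Hypothetical 8-CPU part with two 4-CPU clusters:
 *
 *   DIE: span 0-7, groups = {0-3} {4-7}     (one group per cluster)
 *   MC0: span 0-3, groups = {0} {1} {2} {3}
 *   MC1: span 4-7, groups = {4} {5} {6} {7}
 *
 * for_each_domain(0, tmp) visits MC0 first, then DIE (child to parent);
 * sd->span_weight is 4 at MC and 8 at DIE, which is what the
 * weight <= tmp->span_weight check in find_idlest_cpu relies on.
 */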