

WALT (Window Assisted Load Tracking) Study Notes


QCOM platforms use WALT (Window Assisted Load Tracking) as their CPU load-tracking method; by contrast, the mainline ARM kernels use PELT (Per-Entity Load Tracking).

The core idea of WALT: compute the CPU load over a short span of time and treat the result as one window, then keep statistics across several such windows. From these a task demand is derived, and that result feeds CPU frequency scaling and load balancing (task migration).
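To make this concrete: with the default 20ms window, a task that runs roughly 10ms in every window contributes a per-window sum of about 10ms; across the 5-window history WALT keeps by default, the demand then settles around 10ms (half a window), and it is this single number that later drives frequency selection and task placement.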

The main code lives in walt.c. The code below is from kernel 4.19.

WALT core data structures

struct rq {
...
#ifdef CONFIG_SCHED_WALT
    struct sched_cluster    *cluster;
    struct cpumask        freq_domain_cpumask;
    struct walt_sched_stats walt_stats;
    
    u64            window_start;        // start time of the cpu's current window
    s64            cum_window_start;
    unsigned long        walt_flags;
    
    u64            cur_irqload;        // irq time accumulated in the current period
    u64            avg_irqload;        // decayed average irq load
    u64            irqload_ts;        // timestamp (jiffies) of the last irq accounting
    struct task_struct    *ed_task;    // early-detection task, if any
    struct cpu_cycle    cc;        // cycle counter samples used to derive the cpu freq
    u64            old_busy_time, old_busy_time_group;
    u64            old_estimated_time;
    u64            curr_runnable_sum;    // cpu busy time in the current window
    u64            prev_runnable_sum;    // cpu busy time in the previous window
    u64            nt_curr_runnable_sum;    // same, contribution from new tasks only
    u64            nt_prev_runnable_sum;
    u64            cum_window_demand_scaled;    // cumulative demand of tasks that ran in the current window
    struct group_cpu_time    grp_time;    // busy time of related_thread_group tasks
    struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
    DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
            NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
    u8            *top_tasks[NUM_TRACKED_WINDOWS];    // per-window histograms of task window loads
    u8            curr_table;        // which of the two top-task tables is current
    int            prev_top;        // highest occupied index in the previous window's table
    int            curr_top;        // highest occupied index in the current window's table
    bool            notif_pending;
    u64            last_cc_update;
    u64            cycles;
#endif /* CONFIG_SCHED_WALT */
...
}


struct task_struct {
...
#ifdef CONFIG_SCHED_WALT
    struct ravg ravg;
    /*
     * 'init_load_pct' represents the initial task load assigned to children
     * of this task
     */
    u32 init_load_pct;
    u64 last_wake_ts;
    u64 last_enqueued_ts;
    struct related_thread_group *grp;
    struct list_head grp_list;
    u64 cpu_cycles;
    bool misfit;
    u8 unfilter;
#endif
...
}


#ifdef CONFIG_SCHED_WALT
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
    /*
     * 'mark_start' marks the beginning of an event (task waking up, task
     * starting to execute, task being preempted) within a window
     *
     * 'sum' represents how runnable a task has been within current
     * window. It incorporates both running time and wait time and is
     * frequency scaled.
     *
     * 'sum_history' keeps track of history of 'sum' seen over previous
     * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
     * ignored.
     *
     * 'demand' represents maximum sum seen over previous
     * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
     * demand for tasks.
     *
     * 'curr_window_cpu' represents task's contribution to cpu busy time on
     * various CPUs in the current window
     *
     * 'prev_window_cpu' represents task's contribution to cpu busy time on
     * various CPUs in the previous window
     *
     * 'curr_window' represents the sum of all entries in curr_window_cpu
     *
     * 'prev_window' represents the sum of all entries in prev_window_cpu
     *
     * 'pred_demand' represents task's current predicted cpu busy time
     *
     * 'busy_buckets' groups historical busy time into different buckets
     * used for prediction
     *
     * 'demand_scaled' represents task's demand scaled to 1024
     */
    u64 mark_start;
    u32 sum, demand;
    u32 coloc_demand;
    u32 sum_history[RAVG_HIST_SIZE_MAX];
    u32 *curr_window_cpu, *prev_window_cpu;
    u32 curr_window, prev_window;
    u16 active_windows;
    u32 pred_demand;
    u8 busy_buckets[NUM_BUSY_BUCKETS];
    u16 demand_scaled;
    u16 pred_demand_scaled;
};
#endif

Load accounting

In WALT, a task's load is recorded in demand:

static inline unsigned long task_util(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
    return p->ravg.demand_scaled;    // task load
#endif
    return READ_ONCE(p->se.avg.util_avg);
}

The cpu load is recorded in cumulative_runnable_avg_scaled:

static inline unsigned long cpu_util(int cpu)
{
    struct cfs_rq *cfs_rq;
    unsigned int util;

#ifdef CONFIG_SCHED_WALT
    u64 walt_cpu_util =
        cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled;  // cpu load

    return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
#endif

    cfs_rq = &cpu_rq(cpu)->cfs;
    util = READ_ONCE(cfs_rq->avg.util_avg);

    if (sched_feat(UTIL_EST))
        util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

    return min_t(unsigned long, util, capacity_orig_of(cpu));
}

static inline unsigned long cpu_util_cum(int cpu, int delta)
{
    u64 util = cpu_rq(cpu)->cfs.avg.util_avg;
    unsigned long capacity = capacity_orig_of(cpu);

#ifdef CONFIG_SCHED_WALT
    util = cpu_rq(cpu)->cum_window_demand_scaled;  // cumulative demand of the tasks that have run in the current window (another view of cpu utilization)
#endif
    delta += util;
    if (delta < 0)
        return 0;

    return (delta >= capacity) ? capacity : delta;
}

When WALT is triggered
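update_task_ravg() below is the entry point. The scheduler's hot paths call it with an event describing why it ran; in this kernel the events are TASK_WAKE, PICK_NEXT_TASK, PUT_PREV_TASK, TASK_UPDATE, TASK_MIGRATE and IRQ_UPDATE, i.e. wakeups, context switches, periodic tick updates, migrations, and irq accounting out of idle.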

WALT's main mechanisms

  1. Task load and cpu load accounting

/* Reflect task activity on its demand and cpu's busy time statistics */
void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
                        u64 wallclock, u64 irqtime)
{
    u64 old_window_start;

    if (!rq->window_start || sched_disable_window_stats ||
        p->ravg.mark_start == wallclock)    // three early-return conditions: WALT hasn't started yet; window stats are temporarily disabled; wallclock hasn't advanced, so there is nothing new to account
        return;

    lockdep_assert_held(&rq->lock);

    old_window_start = update_window_start(rq, wallclock, event);  // roll rq->window_start forward if needed, remembering the old window start

    if (!p->ravg.mark_start) {                  // first pass for this task, mark_start was never set: just record cpu cycles and bail out
        update_task_cpu_cycles(p, cpu_of(rq), wallclock);  // update the cpu cycle count (used later in scale_exec_time to derive the cpu freq)
        goto done;
    }

    update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);  // as above, with additional handling for the idle task
    update_task_demand(p, rq, event, wallclock);         // (1.) update the task demand
    update_cpu_busy_time(p, rq, event, wallclock, irqtime);  // (2.) update the cpu busy time
    update_task_pred_demand(rq, p, event);            // (3.) update the predicted task demand

    if (exiting_task(p))      // don't emit trace logs for exiting tasks
        goto done;

    trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
                 rq->cc.cycles, rq->cc.time, &rq->grp_time);
    trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime,
                 rq->cc.cycles, rq->cc.time, &rq->grp_time);

done:
    p->ravg.mark_start = wallclock;           // record where the next WALT accounting interval starts
    run_walt_irq_work(old_window_start, rq);     // (4.) window-rollover irq work
}

1. Account the task's cpu demand and/or update the task's demand history.

The comment below walks through the ravg.sum accounting in three possible cases (a, b, c). All of them normalize the wallclock - mark_start interval; the irqtime path in the code differs slightly, but the principle is the same.

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *    a) Task event is contained within one window.
 *        window_start < mark_start < wallclock
 *
 *        ws   ms  wc
 *        |    |   |
 *        V    V   V
 *        |---------------|
 *
 *    In this case, p->ravg.sum is updated *iff* event is appropriate
 *    (ex: event == PUT_PREV_TASK)
 *
 *    b) Task event spans two windows.
 *        mark_start < window_start < wallclock
 *
 *        ms   ws   wc
 *        |    |    |
 *        V    V    V
 *        -----|-------------------
 *
 *    In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *    is appropriate, then a new window sample is recorded followed
 *    by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *    c) Task event spans more than two windows.
 *
 *        ms ws_tmp               ws  wc
 *        |  |                   |   |
 *        V  V                   V   V
 *        ---|-------|-------|-------|-------|------
 *           |                   |
 *           |<------ nr_full_windows ------>|
 *
 *    In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *    event is appropriate, window sample of p->ravg.sum is recorded,
 *    'nr_full_window' samples of window_size is also recorded *iff*
 *    event is appropriate and finally p->ravg.sum is set to (wc - ws)
 *    *iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static u64 update_task_demand(struct task_struct *p, struct rq *rq,
                   int event, u64 wallclock)
{
    u64 mark_start = p->ravg.mark_start;
    u64 delta, window_start = rq->window_start;
    int new_window, nr_full_windows;
    u32 window_size = sched_ravg_window;
    u64 runtime;

    new_window = mark_start < window_start;      // new_window == 1 corresponds to cases b and c
    if (!account_busy_for_task_demand(rq, p, event)) {  // does this event count as busy time for task demand?
        if (new_window)
            /*
             * If the time accounted isn't being accounted as
             * busy time, and a new window started, only the
             * previous window need be closed out with the
             * pre-existing demand. Multiple windows may have
             * elapsed, but since empty windows are dropped,
             * it is not necessary to account those. (if the time doesn't count as busy and a new window has begun, the busy work is over; elapsed empty windows are dropped, so only the previous window needs closing out)
             */
            update_history(rq, p, p->ravg.sum, 1, event);  // (1.1) record the previous result into history, size: p->ravg.sum
        return 0;
    }

    if (!new_window) {
        /*
         * The simple case - busy time contained within the existing
         * window.
         */
        return add_to_task_demand(rq, p, wallclock - mark_start);  // (1.2) case a: the simplest case, just account wallclock - mark_start
    }

    /*
     * Busy time spans at least two windows. Temporarily rewind
     * window_start to first window boundary after mark_start.
     */
    delta = window_start - mark_start;                   // busy time spans at least two windows: temporarily rewind window_start to the first window boundary after mark_start (ws_tmp in the diagram)
    nr_full_windows = div64_u64(delta, window_size);
    window_start -= (u64)nr_full_windows * (u64)window_size;

    /* Process (window_start - mark_start) first */
    runtime = add_to_task_demand(rq, p, window_start - mark_start);  // account (ws_tmp - mark_start) first

    /* Push new sample(s) into task's demand history */
    update_history(rq, p, p->ravg.sum, 1, event);             // push the closed window into history, size: p->ravg.sum
    if (nr_full_windows) {                          // intervening full windows are recorded too: nr_full_windows samples of scaled_window (window_size scaled by cpu freq etc.)
        u64 scaled_window = scale_exec_time(window_size, rq);

        update_history(rq, p, scaled_window, nr_full_windows, event);
        runtime += nr_full_windows * scaled_window;
    }

    /*
     * Roll window_start back to current to process any remainder
     * in current window.
     */
    window_start += (u64)nr_full_windows * (u64)window_size;      // move window_start back from ws_tmp to its real position

    /* Process (wallclock - window_start) next */
    mark_start = window_start;
    runtime += add_to_task_demand(rq, p, wallclock - mark_start);  // finally account wallclock - mark_start (mark_start now equals window_start)

    return runtime;              // runtime is the total scaled time accounted for the task
}

1.1 Update the history. This pushes the new window value(s), averages the values in the history, and then, depending on the policy, picks the most recent value, the maximum, the average, or the larger of the average and the most recent value (the default).

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
             u32 runtime, int samples, int event)
{
    u32 *hist = &p->ravg.sum_history[0];
    int ridx, widx;
    u32 max = 0, avg, demand, pred_demand;
    u64 sum = 0;
    u16 demand_scaled, pred_demand_scaled;

    /* Ignore windows where task had no activity */
    if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)  // skip windows with no task activity (no runtime, idle, exiting)
        goto done;

    /* Push new 'runtime' value onto stack */
    widx = sched_ravg_hist_size - 1;
    ridx = widx - samples;
    for (; ridx >= 0; --widx, --ridx) {            // shift the older entries down the hist array, dropping the oldest; surviving entries are accumulated into sum
        hist[widx] = hist[ridx];
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }

    for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {  // fill the new sample(s) into the front of the hist array, also accumulating into sum
        hist[widx] = runtime;
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }

    p->ravg.sum = 0;

    if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) {      // pick the demand according to the policy; the default is WINDOW_STATS_MAX_RECENT_AVG (2)
        demand = runtime;
    } else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) {
        demand = max;
    } else {
        avg = div64_u64(sum, sched_ravg_hist_size);          // compute the average
        if (sysctl_sched_window_stats_policy == WINDOW_STATS_AVG)
            demand = avg;
        else
            demand = max(avg, runtime);    // the default policy: larger of average and most recent
    }
    pred_demand = predict_and_update_buckets(rq, p, runtime);    // (1.1.1) predict the demand from the current data
    demand_scaled = scale_demand(demand);                // scale demand to 1024
    pred_demand_scaled = scale_demand(pred_demand);          // scale the predicted demand to 1024

    /*
     * A throttled deadline sched class task gets dequeued without
     * changing p->on_rq. Since the dequeue decrements walt stats
     * avoid decrementing it here again.
     *
     * When window is rolled over, the cumulative window demand
     * is reset to the cumulative runnable average (contribution from
     * the tasks on the runqueue). If the current task is dequeued
     * already, it's demand is not included in the cumulative runnable
     * average. So add the task demand separately to cumulative window
     * demand.
     */

    /* The point of the comment above is fixing up the accounting, in two cases. Case 1:
       the task is on the rq; if its previous demand was x and the new one is y, the cpu
       load is adjusted as cumulative_runnable_avg_scaled += (y - x). Case 2: the task is
       not queued but is the task whose demand was just computed; then the window load is
       adjusted directly: cum_window_demand_scaled += y.

       In one sentence: a new task's demand is added to the cumulative demand outright,
       while a change in an existing task's demand is applied as a delta to the
       cumulative counters.

       Both cum_window_demand_scaled and cumulative_runnable_avg_scaled are expressions
       of cpu utilization. */

    if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {    
        if (task_on_rq_queued(p) &&
                p->sched_class->fixup_walt_sched_stats)
            p->sched_class->fixup_walt_sched_stats(rq, p,
                    demand_scaled, pred_demand_scaled);
        else if (rq->curr == p)
            walt_fixup_cum_window_demand(rq, demand_scaled);
    }

    p->ravg.demand = demand;                        // update the related fields in the ravg struct
    p->ravg.demand_scaled = demand_scaled;
    p->ravg.coloc_demand = div64_u64(sum, sched_ravg_hist_size);  // coloc_demand: plain average of the history, kept separately for the related_thread_group (colocation) logic
    p->ravg.pred_demand = pred_demand;
    p->ravg.pred_demand_scaled = pred_demand_scaled;

    if (demand_scaled > sched_task_filter_util)            // demand_scaled > 35 (0.68ms, default for 20ms window size scaled to 1024)
        p->unfilter = sysctl_sched_task_unfilter_nr_windows;    // above the threshold: lift one of the restrictions on migrating the task to a bigger cpu (one condition among several) and hold it for 10 windows
    else
        if (p->unfilter)
            p->unfilter = p->unfilter - 1;               // below the threshold: decrement the count; at 0 the task no longer qualifies for up-migration

done:
    trace_sched_update_history(rq, p, runtime, samples, event);  // trace log
}
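A quick worked example of the default WINDOW_STATS_MAX_RECENT_AVG policy: assume sched_ravg_hist_size = 5 and that pushing a new runtime of 6ms leaves the history as {6, 4, 4, 4, 4} (ms). Then avg = 22/5 = 4.4ms and demand = max(avg, runtime) = 6ms, so an upward spike is picked up immediately. If the next window drops to 2ms, the history becomes {2, 6, 4, 4, 4}, avg = 4ms, and demand = max(4, 2) = 4ms: a single quiet window is smoothed by the average instead of collapsing the demand.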

1.1.1 Predict the demand and update the buckets

static inline u32 predict_and_update_buckets(struct rq *rq,
            struct task_struct *p, u32 runtime) {

    int bidx;
    u32 pred_demand;

    if (!sched_predl)
        return 0;

    bidx = busy_to_bucket(runtime);              // bucketize runtime into a busy level (1..9); higher means busier
    pred_demand = get_pred_busy(rq, p, bidx, runtime);   // compute the predicted demand, analyzed below
    bucket_increase(p->ravg.busy_buckets, bidx);       // update the buckets, analyzed below

    return pred_demand;
}
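busy_to_bucket() is not listed in this article. For reference, in the msm-4.19 tree it is roughly the following (a sketch, check your own tree for the exact form): it linearly maps the normalized runtime onto the NUM_BUSY_BUCKETS (10) buckets and merges the lowest two, so valid results are 1..9.

static inline int busy_to_bucket(u32 normalized_rt)
{
    int bidx;

    bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
    bidx = min(bidx, NUM_BUSY_BUCKETS - 1);

    /* combine the lowest two buckets; bucket 0 is never returned */
    if (!bidx)
        bidx++;

    return bidx;
}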

Computing the predicted demand (mainly consumed by EAS):

/*
 * get_pred_busy - calculate predicted demand for a task on runqueue
 *
 * @rq: runqueue of task p
 * @p: task whose prediction is being updated
 * @start: starting bucket. returned prediction should not be lower than
 *         this bucket.
 * @runtime: runtime of the task. returned prediction should not be lower
 *           than this runtime.
 * Note: @start can be derived from @runtime. It's passed in only to
 * avoid duplicated calculation in some cases.
 *
 * A new predicted busy time is returned for task @p based on @runtime
 * passed in. The function searches through buckets that represent busy
 * time equal to or bigger than @runtime and attempts to find the bucket to
 * to use for prediction. Once found, it searches through historical busy
 * time and returns the latest that falls into the bucket. If no such busy
 * time exists, it returns the medium of that bucket.
 */
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
                int start, u32 runtime)
{
    int i;
    u8 *buckets = p->ravg.busy_buckets;
    u32 *hist = p->ravg.sum_history;
    u32 dmin, dmax;
    u64 cur_freq_runtime = 0;
    int first = NUM_BUSY_BUCKETS, final;
    u32 ret = runtime;

    /* skip prediction for new tasks due to lack of history */
    if (unlikely(is_new_task(p)))                  // new tasks have no history, so skip prediction
        goto out;

    /* find minimal bucket index to pick */
    for (i = start; i < NUM_BUSY_BUCKETS; i++) {        // find the first non-empty bucket at or above the starting index
        if (buckets[i]) {
            first = i;
            break;
        }
    }
    /* if no higher buckets are filled, predict runtime */
    if (first >= NUM_BUSY_BUCKETS)                // no bucket at or above 'start' is filled: fall back to predicting the runtime itself
        goto out;

    /* compute the bucket for prediction */
    final = first;

    /* determine demand range for the predicted bucket */
    if (final < 2) {                        // the lowest two buckets are combined into bucket 1
        /* lowest two buckets are combined */
        dmin = 0;
        final = 1;
    } else {
        dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);  // map the bucket index back to the lower bound of its runtime range
    }
    dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);  // upper bound of the predicted bucket's runtime range

    /*
     * search through runtime history and return first runtime that falls
     * into the range of predicted bucket.
     */
    for (i = 0; i < sched_ravg_hist_size; i++) {          // search the history for the most recent runtime that falls inside the predicted bucket's range
        if (hist[i] >= dmin && hist[i] < dmax) {
            ret = hist[i];
            break;
        }
    }
    /* no historical runtime within bucket found, use average of the bin */
    if (ret < dmin)                           // nothing found: use the midpoint of the bucket's range
        ret = (dmin + dmax) / 2;
    /*
     * when updating in middle of a window, runtime could be higher
     * than all recorded history. Always predict at least runtime.
     */
    ret = max(runtime, ret);                     // never predict less than the current runtime
out:
    trace_sched_update_pred_demand(rq, p, runtime,
        mult_frac((unsigned int)cur_freq_runtime, 100,
              sched_ravg_window), ret);
    return ret;
}

bucket_increase updates the buckets: the bucket at the matching index is incremented by 8 or 16 (small step / big step), saturating at 255.

Every non-matching bucket decays by 2 until it reaches 0.

#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
#define INC_STEP_BIG 16
/*
 * bucket_increase - update the count of all buckets
 *
 * @buckets: array of buckets tracking busy time of a task
 * @idx: the index of bucket to be incremented
 *
 * Each time a complete window finishes, count of bucket that runtime
 * falls in (@idx) is incremented. Counts of all other buckets are
 * decayed. The rate of increase and decay could be different based
 * on current count in the bucket.
 */
static inline void bucket_increase(u8 *buckets, int idx)
{
    int i, step;

    for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
        if (idx != i) {
            if (buckets[i] > DEC_STEP)
                buckets[i] -= DEC_STEP;
            else
                buckets[i] = 0;
        } else {
            step = buckets[i] >= CONSISTENT_THRES ?
                        INC_STEP_BIG : INC_STEP;
            if (buckets[i] > U8_MAX - step)
                buckets[i] = U8_MAX;
            else
                buckets[i] += step;
        }
    }
}
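For example, a task that keeps landing in bucket 3 sees buckets[3] grow by INC_STEP (8) per window until it crosses CONSISTENT_THRES (16), then by INC_STEP_BIG (16) per window up to the 255 cap, while every other bucket drains by DEC_STEP (2) per window. The array therefore encodes both where the task's busy time usually falls and how consistently it falls there.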

——————————

1.2 add_to_task_demand is straightforward: it normalizes the time the task ran and accumulates it into ravg.sum, capping ravg.sum at sched_ravg_window (20ms).

static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
    delta = scale_exec_time(delta, rq);  // this is where the cpu cycle count recorded earlier comes in
    p->ravg.sum += delta;
    if (unlikely(p->ravg.sum > sched_ravg_window))
        p->ravg.sum = sched_ravg_window;

    return delta;
}
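scale_exec_time() is what makes these sums invariant to cpu frequency and capacity, using the cycle counters sampled earlier. A sketch of its msm-4.19 shape (helper names such as cpu_cycles_to_freq() and exec_scale_factor are from that tree; treat the details as illustrative):

static u64 scale_exec_time(u64 delta, struct rq *rq)
{
    u32 freq;

    /* estimate the frequency the cpu actually ran at from the cycle/time deltas */
    freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
    /* normalize wall time to what it would take at the maximum possible frequency */
    delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
    /* then weight by the cluster's efficiency relative to the most capable cpu */
    delta *= rq->cluster->exec_scale_factor;
    delta >>= 10;

    return delta;
}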

=================

2. On cpu activity, update the cpu busy time (rq->curr/prev_runnable_sum)

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
                 int event, u64 wallclock, u64 irqtime)
{
    int new_window, full_window = 0;
    int p_is_curr_task = (p == rq->curr);
    u64 mark_start = p->ravg.mark_start;
    u64 window_start = rq->window_start;
    u32 window_size = sched_ravg_window;
    u64 delta;
    u64 *curr_runnable_sum = &rq->curr_runnable_sum;
    u64 *prev_runnable_sum = &rq->prev_runnable_sum;
    u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
    u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
    bool new_task;
    struct related_thread_group *grp;
    int cpu = rq->cpu;
    u32 old_curr_window = p->ravg.curr_window;

    new_window = mark_start < window_start;
    if (new_window) {
        full_window = (window_start - mark_start) >= window_size;  // full_window: the last update was at least one full window (>20ms) ago
        if (p->ravg.active_windows < USHRT_MAX)
            p->ravg.active_windows++;
    }

    new_task = is_new_task(p);          // is_new_task(): ravg.active_windows < 5, i.e. the task has only been running for its first few windows

    /*
     * Handle per-task window rollover. We don't care about the idle
     * task or exiting tasks.
     */
    if (!is_idle_task(p) && !exiting_task(p)) {        // no rollover handling needed for idle or exiting tasks
        if (new_window)
            rollover_task_window(p, full_window);      // task rollover: copy curr_window into prev_window, and the per-cpu curr_window_cpu[] into prev_window_cpu[]
    }

    if (p_is_curr_task && new_window) {
        rollover_cpu_window(rq, full_window);        // roll over the rq's and rq->grp_time's curr_runnable_sum / nt_curr_runnable_sum into their prev counterparts
        rollover_top_tasks(rq, full_window);         // roll over the top-tasks tables and curr_top
    }

    if (!account_busy_for_cpu_time(rq, p, irqtime, event))  // not busy (e.g. migrating, idle): just update the top tasks
        goto done;

    grp = p->grp;
    if (grp) {                             // if the task belongs to a related_thread_group, account into rq->grp_time's curr/nt_curr_runnable_sum instead
        struct group_cpu_time *cpu_time = &rq->grp_time;

        curr_runnable_sum = &cpu_time->curr_runnable_sum;
        prev_runnable_sum = &cpu_time->prev_runnable_sum;
        nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
        nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
    }

    if (!new_window) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. No rollover
         * since we didn't start a new window. An example of this is
         * when a task starts execution and then sleeps within the
         * same window. (the task runs and then sleeps inside one window: no rollover needed)
         */

        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
            delta = wallclock - mark_start;                // delta outside the irq/idle/io-wait case
        else
            delta = irqtime;                         // delta when the idle task is accounting irq time
        delta = scale_exec_time(delta, rq);
        *curr_runnable_sum += delta;            // after scaling, accumulate into curr_runnable_sum
        if (new_task)
            *nt_curr_runnable_sum += delta;        // new tasks also accumulate into nt_curr_runnable_sum

        if (!is_idle_task(p) && !exiting_task(p)) {
            p->ravg.curr_window += delta;         // update curr_window and the per-cpu curr_window_cpu[cpu]
            p->ravg.curr_window_cpu[cpu] += delta;
        }

        goto done;
    }

    if (!p_is_curr_task) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has also started, but p is not the current task, so the
         * window is not rolled over - just split up and account
         * as necessary into curr and prev. The window is only
         * rolled over when a new window is processed for the current
         * task.
         *
         * Irqtime can't be accounted by a task that isn't the
         * currently running task. (p is not current, so irqtime cannot be charged to p)
         */

        if (!full_window) {
            /*
             * A full window hasn't elapsed, account partial
             * contribution to previous completed window. (no full window elapsed: only update prev_window)
             */
            delta = scale_exec_time(window_start - mark_start, rq);  // amount accounted: window_start - mark_start
            if (!exiting_task(p)) {
                p->ravg.prev_window += delta;
                p->ravg.prev_window_cpu[cpu] += delta;
            }
        } else {
            /*
             * Since at least one full window has elapsed,
             * the contribution to the previous window is the
             * full window (window_size).
             */
            delta = scale_exec_time(window_size, rq);        // a full window elapsed: account a whole window_size (20ms)
            if (!exiting_task(p)) {
                p->ravg.prev_window = delta;
                p->ravg.prev_window_cpu[cpu] = delta;
            }
        }

        *prev_runnable_sum += delta;              // then update prev_runnable_sum and nt_prev_runnable_sum
        if (new_task)
            *nt_prev_runnable_sum += delta;

        /* Account piece of busy time in the current window. */
        delta = scale_exec_time(wallclock - window_start, rq);  // then account the current window into curr_runnable_sum / nt_curr_runnable_sum
        *curr_runnable_sum += delta;
        if (new_task)
            *nt_curr_runnable_sum += delta;

        if (!exiting_task(p)) {                  // and update curr_window plus the per-cpu curr_window_cpu[cpu]
            p->ravg.curr_window = delta;
            p->ravg.curr_window_cpu[cpu] = delta;
        }

        goto done;
    }

    if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has started and p is the current task so rollover is
         * needed. If any of these three above conditions are true
         * then this busy time can't be accounted as irqtime.
         *
         * Busy time for the idle task or exiting tasks need not
         * be accounted.
         *
         * An example of this would be a task that starts execution
         * and then sleeps once a new window has begun. (a task runs, then sleeps after a new window began; curr/prev_runnable_sum are split similarly)
         */

        if (!full_window) {
            /*
             * A full window hasn't elapsed, account partial
             * contribution to previous completed window.
             */
            delta = scale_exec_time(window_start - mark_start, rq);
            if (!is_idle_task(p) && !exiting_task(p)) {
                p->ravg.prev_window += delta;
                p->ravg.prev_window_cpu[cpu] += delta;
            }
        } else {
            /*
             * Since at least one full window has elapsed,
             * the contribution to the previous window is the
             * full window (window_size).
             */
            delta = scale_exec_time(window_size, rq);
            if (!is_idle_task(p) && !exiting_task(p)) {
                p->ravg.prev_window = delta;
                p->ravg.prev_window_cpu[cpu] = delta;
            }
        }

        /*
         * Rollover is done here by overwriting the values in
         * prev_runnable_sum and curr_runnable_sum.
         */
        *prev_runnable_sum += delta;
        if (new_task)
            *nt_prev_runnable_sum += delta;

        /* Account piece of busy time in the current window. */
        delta = scale_exec_time(wallclock - window_start, rq);
        *curr_runnable_sum += delta;
        if (new_task)
            *nt_curr_runnable_sum += delta;

        if (!is_idle_task(p) && !exiting_task(p)) {
            p->ravg.curr_window = delta;
            p->ravg.curr_window_cpu[cpu] = delta;
        }

        goto done;
    }

    if (irqtime) {                              // when triggered from scheduler_tick, irqtime == 0
        /*
         * account_busy_for_cpu_time() = 1 so busy time needs
         * to be accounted to the current window. A new window
         * has started and p is the current task so rollover is
         * needed. The current task must be the idle task because
         * irqtime is not accounted for any other task.
         *
         * Irqtime will be accounted each time we process IRQ activity
         * after a period of idleness, so we know the IRQ busy time
         * started at wallclock - irqtime. (busy-time accounting when an irq interrupts idle)
         */

        BUG_ON(!is_idle_task(p));
        mark_start = wallclock - irqtime;

        /*
         * Roll window over. If IRQ busy time was just in the current
         * window then that is all that need be accounted.
         */
        if (mark_start > window_start) {
            *curr_runnable_sum = scale_exec_time(irqtime, rq);
            return;
        }

        /*
         * The IRQ busy time spanned multiple windows. Process the
         * busy time preceding the current window start first.
         */
        delta = window_start - mark_start;
        if (delta > window_size)
            delta = window_size;
        delta = scale_exec_time(delta, rq);
        *prev_runnable_sum += delta;

        /* Process the remaining IRQ busy time in the current window. */
        delta = wallclock - window_start;
        rq->curr_runnable_sum = scale_exec_time(delta, rq);

        return;
    }

done:
    if (!is_idle_task(p) && !exiting_task(p))
        update_top_tasks(p, rq, old_curr_window,
                    new_window, full_window);      // (2.1) update the cpu's top tasks
}

2.1 Update the top tasks, maintaining curr_table/prev_table

static void update_top_tasks(struct task_struct *p, struct rq *rq,
        u32 old_curr_window, int new_window, bool full_window)
{
    u8 curr = rq->curr_table;
    u8 prev = 1 - curr;
    u8 *curr_table = rq->top_tasks[curr];
    u8 *prev_table = rq->top_tasks[prev];
    int old_index, new_index, update_index;
    u32 curr_window = p->ravg.curr_window;
    u32 prev_window = p->ravg.prev_window;
    bool zero_index_update;

    if (old_curr_window == curr_window && !new_window)
        return;

    old_index = load_to_index(old_curr_window);      // convert a window load into a table index
    new_index = load_to_index(curr_window);

    if (!new_window) {                              // no new window: adjust the counts at the old/new index in the current top table rq->curr_table[],
        zero_index_update = !old_curr_window && curr_window;       // and update the matching bits in rq->top_tasks_bitmap[curr] based on those counts
        if (old_index != new_index || zero_index_update) {
            if (old_curr_window)
                curr_table[old_index] -= 1;
            if (curr_window)
                curr_table[new_index] += 1;
            if (new_index > rq->curr_top)
                rq->curr_top = new_index;
        }

        if (!curr_table[old_index])
            __clear_bit(NUM_LOAD_INDICES - old_index - 1,
                rq->top_tasks_bitmap[curr]);

        if (curr_table[new_index] == 1)
            __set_bit(NUM_LOAD_INDICES - new_index - 1,
                rq->top_tasks_bitmap[curr]);

        return;
    }

    /*
     * The window has rolled over for this task. By the time we get
     * here, curr/prev swaps would have already occurred. So we need
     * to use prev_window for the new index. (with a new window the update splits in two: a prev_window part and a curr_window part)
     */
    update_index = load_to_index(prev_window);

    if (full_window) {
        /*
         * Two cases here. Either 'p' ran for the entire window or
         * it didn't run at all. In either case there is no entry
         * in the prev table. If 'p' ran the entire window, we just
         * need to create a new entry in the prev table. In this case
         * update_index will be correspond to sched_ravg_window
         * so we can unconditionally update the top index.
         */
        if (prev_window) {
            prev_table[update_index] += 1;
            rq->prev_top = update_index;
        }

        if (prev_table[update_index] == 1)
            __set_bit(NUM_LOAD_INDICES - update_index - 1,
                rq->top_tasks_bitmap[prev]);
    } else {
        zero_index_update = !old_curr_window && prev_window;
        if (old_index != update_index || zero_index_update) {
            if (old_curr_window)
                prev_table[old_index] -= 1;

            prev_table[update_index] += 1;

            if (update_index > rq->prev_top)
                rq->prev_top = update_index;

            if (!prev_table[old_index])
                __clear_bit(NUM_LOAD_INDICES - old_index - 1,
                        rq->top_tasks_bitmap[prev]);

            if (prev_table[update_index] == 1)
                __set_bit(NUM_LOAD_INDICES - update_index - 1,
                        rq->top_tasks_bitmap[prev]);
        }
    }

    if (curr_window) {
        curr_table[new_index] += 1;

        if (new_index > rq->curr_top)
            rq->curr_top = new_index;

        if (curr_table[new_index] == 1)
            __set_bit(NUM_LOAD_INDICES - new_index - 1,
                rq->top_tasks_bitmap[curr]);
    }
}
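The point of these tables: load_to_index() quantizes a window's load into one of NUM_LOAD_INDICES slots, curr_top/prev_top remember the highest occupied slot, and the bitmaps keep that maximum cheap to recompute when an entry empties. top_task_load(rq) can then turn rq->prev_top back into a load value in O(1) when the governor asks; freq_policy_load() further down is the consumer.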

=================

3. At window rollover, if the task's busy time in the current window exceeds the predicted demand, update the prediction to reflect the task's needs.

/*
 * predictive demand of a task is calculated at the window roll-over.
 * if the task current window busy time exceeds the predicted
 * demand, update it here to reflect the task needs.
 */
void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
    u32 new, old;
    u16 new_scaled;

    if (!sched_predl)
        return;

    if (is_idle_task(p) || exiting_task(p))
        return;

    if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
            (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
             (event != TASK_MIGRATE &&
             event != PICK_NEXT_TASK)))
        return;

    /*
     * TASK_UPDATE can be called on sleeping task, when its moved between
     * related groups
     */
    if (event == TASK_UPDATE) {
        if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
            return;
    }

    new = calc_pred_demand(rq, p);  // compute the new predicted demand, same method as get_pred_busy() in 1.1.1
    old = p->ravg.pred_demand;

    if (old >= new)          // no update needed, return
        return;

    new_scaled = scale_demand(new);
    if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||          // fix up the walt stats (cum_window_demand_scaled etc.), as in 1.1
                !p->dl.dl_throttled) &&
                p->sched_class->fixup_walt_sched_stats)
        p->sched_class->fixup_walt_sched_stats(rq, p,
                p->ravg.demand_scaled,
                new_scaled);

    p->ravg.pred_demand = new;              // update pred_demand and pred_demand_scaled
    p->ravg.pred_demand_scaled = new_scaled;
}

=================

4. Decide whether the cpu frequency needs adjusting; in the migration case, also decide whether a cluster/core needs waking up.

static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq)
{
    u64 result;

    if (old_window_start == rq->window_start)      // filter: act only once per window rollover
        return;

    result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start,
                   rq->window_start);
    if (result == old_window_start)
        irq_work_queue(&walt_cpufreq_irq_work);  // queues walt_irq_work()
}
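The atomic64_cmpxchg() is what keeps the work single-shot per window: of all the cpus that observe the same rollover, only the first succeeds in swapping walt_irq_work_lastq_ws from old_window_start to the new value and queues the irq work; the rest see result != old_window_start and return.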
static void walt_init_once(void)
{
...
    init_irq_work(&walt_cpufreq_irq_work, walt_irq_work);
...
}
/*
 * Runs in hard-irq context. This should ideally run just after the latest
 * window roll-over.
 */
void walt_irq_work(struct irq_work *irq_work)
{
    struct sched_cluster *cluster;
    struct rq *rq;
    int cpu;
    u64 wc;
    bool is_migration = false, is_asym_migration = false;
    u64 total_grp_load = 0, min_cluster_grp_load = 0;
    int level = 0;

    /* Am I the window rollover work or the migration work? */
    if (irq_work == &walt_migration_irq_work)
        is_migration = true;

    for_each_cpu(cpu, cpu_possible_mask) {
        if (level == 0)
            raw_spin_lock(&cpu_rq(cpu)->lock);
        else
            raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
        level++;
    }

    wc = sched_ktime_clock();
    walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
    for_each_sched_cluster(cluster) {                            // iterate over every cluster and every cpu in it
        u64 aggr_grp_load = 0;

        raw_spin_lock(&cluster->load_lock);

        for_each_cpu(cpu, &cluster->cpus) {
            rq = cpu_rq(cpu);
            if (rq->curr) {
                update_task_ravg(rq->curr, rq,                  // bring the current task's ravg stats up to date
                        TASK_UPDATE, wc, 0);
                account_load_subtractions(rq);                  // subtract load_subs from curr/prev_runnable_sum and nt_curr/prev_runnable_sum (removes migrated tasks' contribution so the sums don't grow unbounded)
                aggr_grp_load += rq->grp_time.prev_runnable_sum;       // accumulate the cluster's group load
            }
            if (is_migration && rq->notif_pending &&
                cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) {
                is_asym_migration = true;
                rq->notif_pending = false;
            }
        }

        cluster->aggr_grp_load = aggr_grp_load;
        total_grp_load += aggr_grp_load;          // accumulate the total group load

        if (is_min_capacity_cluster(cluster))
            min_cluster_grp_load = aggr_grp_load;
        raw_spin_unlock(&cluster->load_lock);
    }

    if (total_grp_load) {
        if (cpumask_weight(&asym_cap_sibling_cpus)) {
            u64 big_grp_load =
                      total_grp_load - min_cluster_grp_load;

            for_each_cpu(cpu, &asym_cap_sibling_cpus)
                cpu_cluster(cpu)->aggr_grp_load = big_grp_load;
        }
        rtgb_active = is_rtgb_active();
    } else {
        rtgb_active = false;
    }

    if (!is_migration && sysctl_sched_user_hint && time_after(jiffies,
                    sched_user_hint_reset_time))
        sysctl_sched_user_hint = 0;

    for_each_sched_cluster(cluster) {
        cpumask_t cluster_online_cpus;
        unsigned int num_cpus, i = 1;

        cpumask_and(&cluster_online_cpus, &cluster->cpus,
                        cpu_online_mask);
        num_cpus = cpumask_weight(&cluster_online_cpus);
        for_each_cpu(cpu, &cluster_online_cpus) {
            int flag = SCHED_CPUFREQ_WALT;

            rq = cpu_rq(cpu);

            if (is_migration) {
                if (rq->notif_pending) {
                    flag |= SCHED_CPUFREQ_INTERCLUSTER_MIG;
                    rq->notif_pending = false;
                }
            }

            if (is_asym_migration && cpumask_test_cpu(cpu,
                            &asym_cap_sibling_cpus))
                flag |= SCHED_CPUFREQ_INTERCLUSTER_MIG;

            if (i == num_cpus)
                cpufreq_update_util(cpu_rq(cpu), flag);  // last cpu of the cluster: actually (re)evaluate the frequency
            else
                cpufreq_update_util(cpu_rq(cpu), flag |  // SCHED_CPUFREQ_CONTINUE: defer the evaluation to the cluster's last cpu
                            SCHED_CPUFREQ_CONTINUE);
            i++;
        }
    }

    for_each_cpu(cpu, cpu_possible_mask)
        raw_spin_unlock(&cpu_rq(cpu)->lock);

    if (!is_migration)
        core_ctl_check(this_rq()->window_start);  // on window-rollover work, let core_ctl decide whether cores/clusters need to be woken up or isolated
}

  2. IRQ load accounting

When an irq fires, irqtime_account_irq() gets called:

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
...
#ifdef CONFIG_SCHED_WALT
    u64 wallclock;
    bool account = true;
#endif
...
#ifdef CONFIG_SCHED_WALT
    wallclock = sched_clock_cpu(cpu);
#endif
    delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
    irqtime->irq_start_time += delta;

    /*
     * We do not account for softirq time from ksoftirqd here.
     * We want to continue accounting softirq time to ksoftirqd thread
     * in that case, so as not to confuse scheduler with a special task
 * that do not consume any time, but still wants to run. (softirq time reached from ksoftirqd is not accounted here; it keeps being charged to the ksoftirqd thread itself)
     */
    if (hardirq_count())
        irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
    else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
        irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
#ifdef CONFIG_SCHED_WALT
    else
        account = false;

    if (account)
        sched_account_irqtime(cpu, curr, delta, wallclock);  // account the irq time
    else if (curr != this_cpu_ksoftirqd())
        sched_account_irqstart(cpu, curr, wallclock);
#endif
}

Here delta is the irq's execution time. It originally measured from the start of the irq up to the moment irqtime_account_irq() ran; by the time sched_account_irqtime() executes, more instructions have gone by, so delta is corrected once more: delta += sched_clock() - wallclock (wallclock being the timestamp taken earlier).

void sched_account_irqtime(int cpu, struct task_struct *curr,
                 u64 delta, u64 wallclock)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long flags, nr_windows;
    u64 cur_jiffies_ts;

    raw_spin_lock_irqsave(&rq->lock, flags);

    /*
     * cputime (wallclock) uses sched_clock so use the same here for
     * consistency.
     */
    delta += sched_clock() - wallclock;      // correct delta for the time spent getting here
    cur_jiffies_ts = get_jiffies_64();

    if (is_idle_task(curr))                         // if the current task is idle, update the task/cpu load stats via IRQ_UPDATE
        update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
                 delta);

    nr_windows = cur_jiffies_ts - rq->irqload_ts;        // jiffies elapsed since the last irq load update

    if (nr_windows) {
        if (nr_windows < 10) {                    // less than 10 windows since the last irq: decay avg_irqload to 3/4 of its value
            /* Decay CPU's irqload by 3/4 for each window. */
            rq->avg_irqload *= (3 * nr_windows);
            rq->avg_irqload = div64_u64(rq->avg_irqload,
                            4 * nr_windows);
        } else {
            rq->avg_irqload = 0;        // gap >= 10 windows: avg_irqload = 0 (if irqs on this cpu are far apart, its average irq load is negligible)
        }
        rq->avg_irqload += rq->cur_irqload;  // fold cur_irqload into the average, then clear it for re-accumulation below
        rq->cur_irqload = 0;
    }

    rq->cur_irqload += delta;          // accumulate the current irq time
    rq->irqload_ts = cur_jiffies_ts;      // refresh the irq load timestamp
    raw_spin_unlock_irqrestore(&rq->lock, flags);
}
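Note the decay arithmetic: avg_irqload * (3 * nr_windows) / (4 * nr_windows) reduces to a single 3/4 decay however many (<10) windows elapsed, despite the "for each window" wording in the code comment; an avg_irqload of 8ms, for instance, becomes 6ms whether 1 or 9 windows have passed.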

Deciding whether the irqload counts as high:

__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);

static inline int sched_cpu_high_irqload(int cpu)
{
    return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;  //10ms
}
#define SCHED_HIGH_IRQ_TIMEOUT 3

static inline u64 sched_irqload(int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    s64 delta;

    delta = get_jiffies_64() - rq->irqload_ts;
    /*
     * Current context can be preempted by irq and rq->irqload_ts can be
     * updated by irq context so that delta can be negative.
     * But this is okay and we can safely return as this means there
     * was recent irq occurrence.
     */

    if (delta < SCHED_HIGH_IRQ_TIMEOUT)    // only report the average if the last irq was recent; after 3 jiffies (ticks) the irq load is treated as 0
        return rq->avg_irqload;
    else
        return 0;
}

High irqload affects EAS:

Call path: find_energy_efficient_cpu() --> find_best_target() --> sched_cpu_high_irqload()

This path searches the sched domain for the best cpu to migrate a task to; a cpu whose irqload is high is deemed unsuitable and the search moves on to other cpus. This, too, is part of load balancing.

Where WALT's results are used

  1. Load balancing (task migration)

Take can_migrate_task() as an example:

task_util() returns the task's demand, i.e. the task-level load.

cpu_util_cum() returns a cpu rq's cumulative demand, i.e. the cpu-level load.

If dst_cpu's cumulative demand plus the task's demand exceeds src_cpu's cumulative demand minus the task's demand, the move would leave dst busier than src and the migration condition is not met.

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
    ...
        demand = task_util(p);  // task-level load
        util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;  // cpu-level load of dst with the task added
        util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;  // cpu-level load of src with the task removed
    
        if (util_cum_dst > util_cum_src)
            return 0;
    ...
}

The irqload and the predicted demand discussed earlier also feed into load balancing.

  2. CPU frequency scaling

The following three call paths adjust cpu freq through WALT:

walt_irq_work(): WALT's irq work
scheduler_tick(): the early-detection case in periodic scheduling (when the scheduler finds a task that has been runnable for longer than SCHED_EARLY_DETECTION_DURATION, it notifies the governor that a frequency raise is imminent; the kernel documentation explains:)

A further enhancement during boost is the scheduler' early detection feature.
While boost is in effect the scheduler checks for the presence of tasks that
have been runnable for over some period of time within the tick. For such
tasks the scheduler informs the governor of imminent need for high frequency.
If there exists a task on the runqueue at the tick that has been runnable
for greater than SCHED_EARLY_DETECTION_DURATION amount of time, it notifies
the governor with a fabricated load of the full window at the highest
frequency. The fabricated load is maintained until the task is no longer
runnable or until the next tick.

try_to_wake_up(): task wakeup

walt_irq_work()  
scheduler_tick()  --> flag = SCHED_CPUFREQ_WALT --> cpufreq_update_util(cpu_rq(cpu),flag)
try_to_wake_up()  

Two kinds of governor consume this: the legacy CPUFREQ governors and the newer schedutil governor.

1、CPUFREQ governor
static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
                unsigned int delay_us)
{
    ...
        cpufreq_add_update_util_hook(cpu, &cdbs->update_util,
                         dbs_update_util_handler);
    ...
}


2、[schedutil] cpufreq governor
static int sugov_start(struct cpufreq_policy *policy)
{
    ...
        cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
                         policy_is_shared(policy) ?
                            sugov_update_shared :
                            sugov_update_single);
    ...
}
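cpufreq_update_util() simply invokes whichever hook was registered above for the cpu, so all three trigger paths listed earlier end up in dbs_update_util_handler() or sugov_update_shared()/sugov_update_single(), which then pull in the WALT load shown next.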

Taking schedutil as the example:

freq_policy_load() in walt.c returns the load (util) computed by WALT, which sugov_next_freq_shared() then uses to work out the new cpu frequency.

static inline u64 freq_policy_load(struct rq *rq)
{
    unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
    struct sched_cluster *cluster = rq->cluster;
    u64 aggr_grp_load = cluster->aggr_grp_load;
    u64 load, tt_load = 0;
    struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu_of(rq));

    if (rq->ed_task != NULL) {      // an early-detection task is present: report a full window (20ms) of load
        load = sched_ravg_window;
        goto done;
    }

    if (sched_freq_aggr_en)                    // freq aggregation changes the load calculation; it is enabled while sched_boost is full_throttle_boost or restrained_boost and disabled on exit
        load = rq->prev_runnable_sum + aggr_grp_load;
    else
        load = rq->prev_runnable_sum + rq->grp_time.prev_runnable_sum;

    if (cpu_ksoftirqd && cpu_ksoftirqd->state == TASK_RUNNING)
        load = max_t(u64, load, task_load(cpu_ksoftirqd));  // if ksoftirqd is running, report at least its load

    tt_load = top_task_load(rq);                // load of the top task
    switch (reporting_policy) {              // pick what to report according to the reporting policy
    case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
        load = max_t(u64, load, tt_load);        // report the larger of cpu load and top-task load
        break;
    case FREQ_REPORT_TOP_TASK:
        load = tt_load;
        break;
    case FREQ_REPORT_CPU_LOAD:
        break;
    default:
        break;
    }

    if (should_apply_suh_freq_boost(cluster)) {      // apply the user-hint frequency boost if needed
        if (is_suh_max())
            load = sched_ravg_window;
        else
            load = div64_u64(load * sysctl_sched_user_hint,
                     (u64)100);
    }

done:
    trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, sched_freq_aggr_en,
                load, reporting_policy, walt_rotation_enabled,
                sysctl_sched_user_hint);
    return load;
}

WALT vs PELT

To sum up WALT's advantages:

1. It recognizes heavy tasks faster.

2. Its cpu utilization figure reacts faster, so cpu frequency can be driven up and down more quickly.
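Part of the difference is the shape of the averaging. PELT decays history geometrically (utilization is a sum of per-period contributions weighted by y^i with y^32 = 1/2, i.e. a ~32ms half-life), so a task must run for on the order of a hundred milliseconds before it looks heavy, and takes similarly long to look light again. WALT's demand is instead a max/average over a handful of 20ms windows, windows in which the task slept entirely are simply dropped, and the cpu-side prev_runnable_sum covers just the last window, so both ramp-up and ramp-down become visible within a few windows.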

