當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

CPU中的DVFS

發布時間：2023/12/16 编程问答 41 豆豆

生活随笔收集整理的這篇文章主要介紹了 CPU中的DVFS 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

轉載CPU動態調頻二：interactive governor

加入了自己的一些看法，看源碼可以用source insight軟件。

Linux提供了多種governor供用戶選擇，這里以interactive為例，畢竟現在的android手機中都是采用該governor。基于linux 3.14
以下代碼若未指明位置則默認在drivers/cpufreq/cpufreq_interactive.c中.

首先需要定義一個cpufreq_governor類型的結構體用來描述interactive governor.
?

/*創建一個結構體，保存我們所需要的策略*/ struct cpufreq_governor cpufreq_gov_interactive = {.name = "interactive",.governor = cpufreq_governor_interactive,//策略的主體函數.max_transition_latency = 10000000,.owner = THIS_MODULE, };

看一下cpufreq_governor結構體：

/* Describes one cpufreq governor as registered with the cpufreq core. */
struct cpufreq_governor {
	char name[CPUFREQ_NAME_LEN];
	int initialized;
	/* Governor callback: receives the policy and an event code. */
	int (*governor) (struct cpufreq_policy *policy,
			 unsigned int event);
	ssize_t (*show_setspeed) (struct cpufreq_policy *policy,
				  char *buf);
	int (*store_setspeed) (struct cpufreq_policy *policy,
			       unsigned int freq);
	unsigned int max_transition_latency; /* HW must be able to switch to
			next freq faster than this value in nano secs or we
			will fallback to performance governor */
	struct list_head governor_list;
	struct module *owner;
};

name：governor的名字，這里被賦值為interactive
initialized：初始化標志位
max_transition_latency：注釋說的很清楚了，硬件從當前頻率切換到下一個頻率時所用的時間必須比max_transition_latency規定的時間小，否則governor將切換到performance.該數值以納秒為單位.
governor_list：所有注冊的governor都會被add到這個鏈表里面。
governor：這個callback用于控制governor的行為，比較重要，是governor的一個切入點，后面會詳解.

好了，現在我們靜態的定義了一個interactive governor，在governor工作之前還要做一些初始化工作

/*
 * Module init for the interactive governor: prepares the per-CPU timers,
 * locks and semaphores, creates and wakes the speed-change thread, then
 * registers the governor with the cpufreq core.
 *
 * Returns the PTR_ERR() of kthread_create() if the thread cannot be
 * created, otherwise the result of cpufreq_register_governor().
 */
static int __init cpufreq_interactive_init(void)
{
	unsigned int i;
	struct cpufreq_interactive_cpuinfo *pcpu;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

	/* Initalize per-cpu timers */
	for_each_possible_cpu(i) {
		pcpu = &per_cpu(cpuinfo, i);	/* this CPU's governor state */
		init_timer_deferrable(&pcpu->cpu_timer);
		/* Handler invoked when the sampling timer expires. */
		pcpu->cpu_timer.function = cpufreq_interactive_timer;
		/* The handler's argument is the CPU number. */
		pcpu->cpu_timer.data = i;
		init_timer(&pcpu->cpu_slack_timer);	/* slack timer */
		pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;
		spin_lock_init(&pcpu->load_lock);
		init_rwsem(&pcpu->enable_sem);
	}

	spin_lock_init(&speedchange_cpumask_lock);
	mutex_init(&gov_lock);
	/* Create the thread that carries out the frequency changes. */
	speedchange_task =
		kthread_create(cpufreq_interactive_speedchange_task, NULL,
			       "cfinteractive");
	if (IS_ERR(speedchange_task))
		return PTR_ERR(speedchange_task);

	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
	get_task_struct(speedchange_task);

	/* NB: wake up so the thread does not look hung to the freezer */
	wake_up_process(speedchange_task);

	/*
	 * Register the interactive governor; cpufreq_register_governor()
	 * lives in drivers/cpufreq/cpufreq.c.
	 */
	return cpufreq_register_governor(&cpufreq_gov_interactive);
}

遍歷可能的CPU
get到每個CPU的cpuinfo成員
初始化可延時定時器
設置定時器的function，定時器超時時會調用該函數
設置定時器的data，這里表示CPU ID
初始化slack定時器
設置該定時器的function，定時器超時時會調用該函數
初始化保護負載統計數據的自旋鎖load_lock
初始化讀寫信號量enable_sem
創建一個線程cpufreq_interactive_speedchange_task，返回的進程描述符用speedchange_task保存，這個很重要，是改變速度的任務
設置該線程的調度策略和調度參數
該線程的引用計數加1，沒有看出來
喚醒speedchange_task
調用cpufreq_register_governor注冊interactive governor,在drivers/cpufreq/cpufreq.c中

/* List of all registered governors. */
static LIST_HEAD(cpufreq_governor_list);

/*
 * Add a governor to cpufreq_governor_list.
 *
 * Returns -EINVAL for a NULL governor, -ENODEV when cpufreq is disabled,
 * -EBUSY when a governor with the same name is already on the list, and
 * 0 on success.
 */
int cpufreq_register_governor(struct cpufreq_governor *governor)
{
	int err;

	if (!governor)
		return -EINVAL;

	if (cpufreq_disabled())
		return -ENODEV;

	mutex_lock(&cpufreq_governor_mutex);

	governor->initialized = 0;
	/* Assume a name clash until the lookup proves otherwise. */
	err = -EBUSY;
	if (__find_governor(governor->name) == NULL) {
		err = 0;
		list_add(&governor->governor_list, &cpufreq_governor_list);
	}

	mutex_unlock(&cpufreq_governor_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(cpufreq_register_governor);

cpufreq_governor_list用來保存已注冊的governor
__find_governor會在cpufreq_governor_list中遍歷尋找是否有與需要register的governor重名的governor，如果沒有則將該governor添加到cpufreq_governor_list中
好的，簡單介紹了一下governor的定義，初始化，注冊。
現在我們已經擁有了一個interactive governor，CPUFREQ core如果想操作governor進行選頻，那么interactive governor必須對外提供一個interface以供調用，這就是cpufreq_governor結構體中的governor callback，下面來以這個interface為切入點分析governor是如何工作的.
?

The governor->governor callback is called with the current (or to-be-set) cpufreq_policy struct for that CPU, and an unsigned int event. The following events are currently defined:CPUFREQ_GOV_START: This governor shall start its duty for the CPUpolicy->cpu CPUFREQ_GOV_STOP: This governor shall end its duty for the CPUpolicy->cpu CPUFREQ_GOV_LIMITS: The limits for CPU policy->cpu have changed topolicy->min and policy->max.

在前面的定義中有如下，這就是函數的入口

.governor = cpufreq_governor_interactive,

下面來看一下cpufreq_governor_interactive，分段分析：

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,unsigned int event) {int rc;unsigned int j;struct cpufreq_interactive_cpuinfo *pcpu;struct cpufreq_frequency_table *freq_table;//頻率表格struct cpufreq_interactive_tunables *tunables;//策略的參數

定義了一堆變量：pcpu描述了cpu相關信息，結構體如下，用到的時候在看。

/* Per-CPU state of the interactive governor (accessed via per_cpu(cpuinfo, cpu)). */
struct cpufreq_interactive_cpuinfo {
	struct timer_list cpu_timer;
	struct timer_list cpu_slack_timer;
	spinlock_t load_lock; /* protects the next 4 fields */
	u64 time_in_idle;
	u64 time_in_idle_timestamp;
	u64 cputime_speedadj;
	u64 cputime_speedadj_timestamp;
	struct cpufreq_policy *policy;
	struct cpufreq_frequency_table *freq_table;
	spinlock_t target_freq_lock; /* protects target freq */
	unsigned int target_freq;
	unsigned int floor_freq;
	unsigned int max_freq;
	u64 floor_validate_time;
	u64 hispeed_validate_time;
	struct rw_semaphore enable_sem;
	int governor_enabled;
};

freq_tab表示頻率表，結構體如下，你會發現這是一個node，每個node代表一個頻點，很多node關聯在一起就成了一個tab：

struct cpufreq_frequency_table {unsigned int driver_data; /* driver specific data, not used by core */unsigned int frequency; /* kHz - doesn't need to be in ascending* order */ };

struct cpufreq_interactive_tunables *tunables;這個結構體很重要，貫穿了整個governor callback，先給出結構體，接下來在函數中邊看邊分析。可以看出來，這是策略的一些參數的定義

struct cpufreq_interactive_tunables {int usage_count;/* Hi speed to bump to from lo speed when load burst (default max) */unsigned int hispeed_freq;/* Go to hi speed when CPU load at or above this value. */#define DEFAULT_GO_HISPEED_LOAD 99unsigned long go_hispeed_load;/* Target load. Lower values result in higher CPU speeds. */spinlock_t target_loads_lock;unsigned int *target_loads;int ntarget_loads;/** The minimum amount of time to spend at a frequency before we can ramp* down.*/#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)unsigned long min_sample_time;/** The sample rate of the timer used to increase frequency*/unsigned long timer_rate;/** Wait this long before raising speed above hispeed, by default a* single timer interval.*/spinlock_t above_hispeed_delay_lock;unsigned int *above_hispeed_delay;int nabove_hispeed_delay;/* Non-zero means indefinite speed boost active */int boost_val;/* Duration of a boot pulse in usecs */int boostpulse_duration_val;/* End time of boost pulse in ktime converted to usecs */u64 boostpulse_endtime;bool boosted;/** Max additional time to wait in idle, beyond timer_rate, at speeds* above minimum before wakeup to reduce speed, or -1 if unnecessary.*/#define DEFAULT_TIMER_SLACK (4 * DEFAULT_TIMER_RATE)int timer_slack_val;bool io_is_busy; };

?回到cpufreq_governor_interactive函數，繼續往下看

if (have_governor_per_policy()) //判斷是否每個cpu都有策略tunables = policy->governor_data;elsetunables = common_tunables; //只有一個策略的（一個策略控制所有核）

have_governor_per_policy判斷是否每個policy都有自己的governor，當沒有設置have_governor_per_policy時，表示所有的policy使用了同一種governor，該字段指向該governor的dbs_data結構。

/* For cases where we have single governor instance for system */
static struct cpufreq_interactive_tunables *common_tunables;

但是沒有分配內存和初始化。繼續往下看。

switch (event) {

判斷event的類型，并根據event進行不同的操作。
在include/linux/cpufreq.h中定義了幾種Governor Events

/* Governor Events */
#define CPUFREQ_GOV_START	1
#define CPUFREQ_GOV_STOP	2
#define CPUFREQ_GOV_LIMITS	3
#define CPUFREQ_GOV_POLICY_INIT	4
#define CPUFREQ_GOV_POLICY_EXIT	5

第一個是：CPUFREQ_GOV_POLICY_INIT

case CPUFREQ_GOV_POLICY_INIT://初始化if (have_governor_per_policy()) {WARN_ON(tunables);} else if (tunables) {//這個好像是判斷tunables有沒有分配內存tunables->usage_count++;policy->governor_data = tunables;return 0;}tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);//分配內存if (!tunables) {pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);return -ENOMEM;}/*然后初始化*/tunables->usage_count = 1;tunables->above_hispeed_delay = default_above_hispeed_delay; //高速度向更高情況下的延時tunables->nabove_hispeed_delay =ARRAY_SIZE(default_above_hispeed_delay);tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;//負載閾值tunables->target_loads = default_target_loads;//目標負載tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;//最小采樣時間tunables->timer_rate = DEFAULT_TIMER_RATE;//采樣率tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;//超頻最少持續時間？tunables->timer_slack_val = DEFAULT_TIMER_SLACK;//空閑時間的采樣率spin_lock_init(&tunables->target_loads_lock);spin_lock_init(&tunables->above_hispeed_delay_lock);policy->governor_data = tunables; //？？if (!have_governor_per_policy())common_tunables = tunables;rc = sysfs_create_group(get_governor_parent_kobj(policy),get_sysfs_attr());//attribute_group??if (rc) {kfree(tunables);policy->governor_data = NULL;if (!have_governor_per_policy())common_tunables = NULL;return rc;}if (!policy->governor->initialized) {idle_notifier_register(&cpufreq_interactive_idle_nb);cpufreq_register_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);}break;

該event表示要init governor policy.
首先判斷have_governor_per_policy()，前面分析過了，返回false，并且tunables并沒有被分配內存，所以執行下一條語句

tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);

?終于為tunables分配內存了~~
接下來就是對tunables的初始化：

tunables->usage_count = 1;tunables->above_hispeed_delay = default_above_hispeed_delay;tunables->nabove_hispeed_delay =ARRAY_SIZE(default_above_hispeed_delay);tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;tunables->target_loads = default_target_loads;tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_rate = DEFAULT_TIMER_RATE;tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_slack_val = DEFAULT_TIMER_SLACK;spin_lock_init(&tunables->target_loads_lock);spin_lock_init(&tunables->above_hispeed_delay_lock);

?usage_count表示引用計數，初始化的時候設置為1

above_hispeed_delay，內核文檔： Documentation/cpu-freq/governors.txt

above_hispeed_delay: When speed is at or above hispeed_freq, wait for this long before raising speed in response to continued high load. The format is a single delay value, optionally followed by pairs of CPU speeds and the delay to use at or above those speeds. Colons can be used between the speeds and associated delays for readability. For example:80000 1300000:200000 1500000:40000uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay 200000 uS is used until speed 1.5 GHz, at which speed (and above) delay 40000 uS is used. If speeds are specified these must appear in ascending order. Default is 20000 uS.

下面是我自己的理解，有錯誤請務必指出：當CPU頻率大于等于hispeed_freq，并且此時workload仍在不停增加（continued high load），系統將等待一個above_hispeed_delay的時間。above_hispeed_delay一般是這樣一種格式，一個單個的延時數值，后面跟上一組由CPU speeds 和 delay組成的數組，由冒號隔開。例如：
80000 1300000:200000 1500000:40000
當頻率低于1.3G時，above_hispeed_delay的值取80000，1.3G到1.5G之間取200000，大于等于1.5G取40000.默認取20000us.如果指定了頻率，那么這些頻率必須按升序排列。（最后一句不是很確定很理解）

#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC) #define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_TIMER_RATE static unsigned int default_above_hispeed_delay[] = {DEFAULT_ABOVE_HISPEED_DELAY };

可以看到default_above_hispeed_delay是一個數組，我的環境下只有一個數值，above_hispeed_delay的數值就是ta了。

nabove_hispeed_delay:default_above_hispeed_delays數組中元素的個數。

go_hispeed_load: The CPU load at which to ramp to hispeed_freq.Default is 99%。高頻閾值。當系統的負載超過該值，升頻，否則降頻。

#define DEFAULT_GO_HISPEED_LOAD 99

調頻的時候會用到這個數值，后面會說到.

順便說一下hispeed_freq: Hi speed to bump to from lo speed when load burst (default max)。當workload達到 go_hispeed_load時，頻率將被拉高到這個值，默認的大小由policy來決定。
target_loads:

CPU load values used to adjust speed to influence the current CPU load toward that value. In general, the lower the target load, the more often the governor will raise CPU speeds to bring load below the target. The format is a single target load, optionally followed by pairs of CPU speeds and CPU loads to target at or above those speeds. Colons can be used between the speeds and associated target loads for readability. For example:85 1000000:90 1700000:99targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until 1.7GHz and above, at which load 99% is targeted. If speeds are specified these must appear in ascending order. Higher target load values are typically specified for higher speeds, that is, target load values also usually appear in an ascending order. The default is target load 90% for all speeds.

target_loads使得CPU調整頻率來影響當前的CPU workload，促使當前的CPU workload向target_loads靠近。通常，target_loads的值越小，CPU就會越頻繁地拉高頻率使當前workload低于target_loads。例如：頻率小于1G時，取85%；1G—-1.7G，取90%；大于1.7G，取99%。默認值取90%。

/* Target load. Lower values result in higher CPU speeds. */
#define DEFAULT_TARGET_LOAD 90
static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};

ntarget_loads：target_loads的個數

min_sample_time：最小采樣時間，剛好80000us.

#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)

/* From include/linux/time.h */
#define USEC_PER_MSEC 1000L

boostpulse_duration_val：

boost: If non-zero, immediately boost speed of all CPUs to at least hispeed_freq until zero is written to this attribute. If zero, allow CPU speeds to drop below hispeed_freq according to load as usual. Default is zero.boostpulse: On each write, immediately boost speed of all CPUs to hispeed_freq for at least the period of time specified by boostpulse_duration, after which speeds are allowed to drop below hispeed_freq according to load as usual.boostpulse_duration: Length of time to hold CPU speed at hispeed_freq on a write to boostpulse, before allowing speed to drop according to load as usual. Default is 80000 uS.

boost：即超頻，在linux中我們可以操作方法是，但內核中不知道怎么弄

echo 1 > /sys/devices/system/cpu/cpufreq/interactive/boost

此時會立即將所有CPU的頻率提高到至少hispeed_freq.寫入0時，根據workload降低頻率.默認為0.

boostpulse，每次觸發boost功能時，立即拉高所有CPU的頻率到hispeed_freq并保持在該頻率至少boostpulse_duration的時間，在這段時間以后，根據當前的workload，頻率才允許被降低。

boostpulse_duration：默認值80000 uS.這里我的值也是80000 uS.

timer_rate和timer_slack_val：當CPU不處于idle狀態時，timer_rate作為采樣速率來計算CPU的workload.
當CPU處于idle狀態時，由于使用的是可延時定時器，CPU不會為了響應該定時器而從idle狀態被喚醒.
定時器的最大的可延時時間用timer_slack表示，默認值80000 uS.

timer_rate: Sample rate for reevaluating CPU load when the CPU is not idle. A deferrable timer is used, such that the CPU will not be woken from idle to service this timer until something else needs to run. (The maximum time to allow deferring this timer when not running at minimum speed is configurable via timer_slack.) Default is 20000 uS.timer_slack: Maximum additional time to defer handling the governor sampling timer beyond timer_rate when running at speeds above the minimum. For platforms that consume additional power at idle when CPUs are running at speeds greater than minimum, this places an upper bound on how long the timer will be deferred prior to re-evaluating load and dropping speed. For example, if timer_rate is 20000uS and timer_slack is 10000uS then timers will be deferred for up to 30msec when not at lowest speed. A value of -1 means defer timers indefinitely at all speeds. Default is 80000 uS.

繼續向下看

spin_lock_init(&tunables->target_loads_lock);spin_lock_init(&tunables->above_hispeed_delay_lock);policy->governor_data = tunables; //？？if (!have_governor_per_policy())common_tunables = tunables;rc = sysfs_create_group(get_governor_parent_kobj(policy),get_sysfs_attr());//attribute_group??if (rc) {kfree(tunables);policy->governor_data = NULL;if (!have_governor_per_policy())common_tunables = NULL;return rc;}if (!policy->governor->initialized) {idle_notifier_register(&cpufreq_interactive_idle_nb);cpufreq_register_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);}break;

初始化tunables結構體中的兩個自旋鎖。
將tunables指針賦值給policy->governor_data
將tunables指針賦值給common_tunables，這個全局變量會在一些文件的show和store函數中被調用，沒有深入研究.
繼續

rc = sysfs_create_group(get_governor_parent_kobj(policy),get_sysfs_attr());

看一下get_governor_parent_kobj和get_sysfs_attr，在drivers/cpufreq/cpufreq.c中

/*
 * Parent kobject for the governor's sysfs group: the policy's own kobject
 * when have_governor_per_policy() is true, otherwise cpufreq_global_kobject.
 */
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
{
	if (have_governor_per_policy())
		return &policy->kobj;
	else
		return cpufreq_global_kobject;
}
EXPORT_SYMBOL_GPL(get_governor_parent_kobj);

struct kobject *cpufreq_global_kobject;
EXPORT_SYMBOL(cpufreq_global_kobject);

/* Select the attribute group matching per-policy or system-wide mode. */
static struct attribute_group *get_sysfs_attr(void)
{
	if (have_governor_per_policy())
		return &interactive_attr_group_gov_pol;
	else
		return &interactive_attr_group_gov_sys;
}

/* System-wide attribute group; .name gives the sysfs directory name. */
static struct attribute_group interactive_attr_group_gov_sys = {
	.attrs = interactive_attributes_gov_sys,
	.name = "interactive",
};

/* One Governor instance for entire system */
static struct attribute *interactive_attributes_gov_sys[] = {
	&target_loads_gov_sys.attr,
	&above_hispeed_delay_gov_sys.attr,
	&hispeed_freq_gov_sys.attr,
	&go_hispeed_load_gov_sys.attr,
	&min_sample_time_gov_sys.attr,
	&timer_rate_gov_sys.attr,
	&timer_slack_gov_sys.attr,
	&boost_gov_sys.attr,
	&boostpulse_gov_sys.attr,
	&boostpulse_duration_gov_sys.attr,
	&io_is_busy_gov_sys.attr,
	NULL,
};

OK，我們把上述代碼簡化一下得到

rc = sysfs_create_group(cpufreq_global_kobject,interactive_attr_group_gov_sys);

在cpufreq_global_kobject所對應的目錄cpufreq下創建一個名為interactive的目錄，并創建與之關聯的屬性文件。通過以下方式可以看到這些屬性文件

ls /sys/devices/system/cpu/cpufreq/interactive/

最后注冊了兩個notification，分別是idle相關和頻率改變相關.
回顧一下CPUFREQ_GOV_POLICY_INIT都做了什么：
1.定義并初始化了一個cpufreq_interactive_tunables結構體，將該結構體指針賦值給policy->governor_data，在struct cpufreq_policy結構體中，policy->governor_data為void *指針，現在我們知道它的作用是指向tunables，而tunables對應的內存中存放了governor調節頻率的參數，這就是policy->governor_data的作用.
2.創建對應的目錄和屬性文件

第二個是：CPUFREQ_GOV_POLICY_EXIT

這個event對應的操作比較簡單一些，主要是做一些policy和governor的“善后”工作，不必贅述了。

case CPUFREQ_GOV_POLICY_EXIT://退出if (!--tunables->usage_count) {//對每一個都釋放注冊if (policy->governor->initialized == 1) {cpufreq_unregister_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);idle_notifier_unregister(&cpufreq_interactive_idle_nb);}sysfs_remove_group(get_governor_parent_kobj(policy),get_sysfs_attr());kfree(tunables);//釋放內存common_tunables = NULL;//釋放內存}policy->governor_data = NULL;break;

第三個是：CPUFREQ_GOV_START

啟動一個governor

case CPUFREQ_GOV_START://開始工作mutex_lock(&gov_lock); //鎖住進程freq_table = cpufreq_frequency_get_table(policy->cpu);//得到CPU頻率表if (!tunables->hispeed_freq)//高頻的頻率tunables->hispeed_freq = policy->max;for_each_cpu(j, policy->cpus) {//遍歷CPU進行設置？？pcpu = &per_cpu(cpuinfo, j);pcpu->policy = policy;pcpu->target_freq = policy->cur;pcpu->freq_table = freq_table;pcpu->floor_freq = pcpu->target_freq;pcpu->floor_validate_time =ktime_to_us(ktime_get());pcpu->hispeed_validate_time =pcpu->floor_validate_time;down_write(&pcpu->enable_sem);del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);cpufreq_interactive_timer_start(tunables, j);//開啟定時器，賊重要pcpu->governor_enabled = 1; //開啟up_write(&pcpu->enable_sem);}mutex_unlock(&gov_lock);break;

首先獲取freq_tab.
如果沒有設置hispeed_freq的值的話，就設置hispeed_freq為policy->max，和之前介紹hispeed_freq時說的一樣.
接下來是一個for循環，policy->cpus表示所有處于online狀態的CPU，for循環遍歷所有處于online狀態的CPU，在這個循環中：
get到cpu的cpuinfo結構體并把指針賦值給pcpu，一個struct cpufreq_interactive_cpuinfo結構體指針.
然后對pcpu的一些成員進行初始化，本質上還是設置online cpus的cpuinfo結構體成員.
然后調用cpufreq_interactive_timer_start啟動相關的定時器
啟動定時器以后governor就可以工作了，所以設置pcpu->governor_enabled為1
來看代碼：

static void cpufreq_interactive_timer_start(//還沒細看struct cpufreq_interactive_tunables *tunables, int cpu) {struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);//讀取CPU狀態信息unsigned long expires = jiffies +usecs_to_jiffies(tunables->timer_rate);unsigned long flags;pcpu->cpu_timer.expires = expires;add_timer_on(&pcpu->cpu_timer, cpu);if (tunables->timer_slack_val >= 0 &&pcpu->target_freq > pcpu->policy->min) {expires += usecs_to_jiffies(tunables->timer_slack_val);pcpu->cpu_slack_timer.expires = expires;add_timer_on(&pcpu->cpu_slack_timer, cpu);}spin_lock_irqsave(&pcpu->load_lock, flags);pcpu->time_in_idle =get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp,tunables->io_is_busy);pcpu->cputime_speedadj = 0;pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;spin_unlock_irqrestore(&pcpu->load_lock, flags); }

注釋中有解釋：The cpu_timer and cpu_slack_timer must be deactivated when calling this function.
所以在進入cpufreq_interactive_timer_start之前有一些deactive的操作：
?

del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer);

看看cpufreq_interactive_timer_start究竟做了什么
設置定時器的到期時間expire
調用add_timer_on添加定時器，”start a timer on a particular CPU”
在指定的CPU上start一個定時器，假如我的手機上有4個CPU，那么將有四個定時器被添加到pcpu->cpu_timer鏈表中
cpu_slack_timer也是同樣的操作
然后獲取該CPU的idle時間，這個數值在統計更新時間的時候會被用到.

pcpu->time_in_idle = get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp,tunables->io_is_busy);

隨后就是

pcpu->cputime_speedadj = 0;pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;

time_in_idle_timestamp的數值在get_cpu_idle_time函數中被更新，在代碼中形參的名字為last_update_time，可以理解為更新time_in_idle的時間戳。網上有人解釋為計算機啟動到現在的時間，是一樣的.

OK，到這里start governor的工作就完成了，主要就是啟動了兩個定時器，定時器到期的話，會執行相關的操作最終選定要set的頻率.
本來到這里我們應該回到cpufreq_governor_interactive中分析event為CPUFREQ_GOV_LIMITS的情況。
但是為了思路的流暢性，我們順著定時器繼續追代碼，看定時器如何實現選頻.

__init cpufreq_interactive_init函數中

cpu->cpu_timer.function = cpufreq_interactive_timer;pcpu->cpu_timer.data = i;

定時器到期時，調用 cpufreq_interactive_timer，
這里data是cpu的索引號，在cpufreq_interactive_init中cpu_timer的data成員被賦值成為CPU的索引號，之后調用cpu_timer.function的時候作為實參。最主要的
分段看：

static void cpufreq_interactive_timer(unsigned long data) {u64 now;unsigned int delta_time;u64 cputime_speedadj;int cpu_load;struct cpufreq_interactive_cpuinfo *pcpu =&per_cpu(cpuinfo, data); //得到信息struct cpufreq_interactive_tunables *tunables =pcpu->policy->governor_data;//得到信息unsigned int new_freq;unsigned int loadadjfreq;unsigned int index;unsigned long flags;bool boosted;if (!down_read_trylock(&pcpu->enable_sem))return;if (!pcpu->governor_enabled)goto exit;spin_lock_irqsave(&pcpu->load_lock, flags);now = update_load(data);//更新負載，總時間delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);//兩次統計之間的時間cputime_speedadj = pcpu->cputime_speedadj;//頻率的積分總和？？spin_unlock_irqrestore(&pcpu->load_lock, flags);if (WARN_ON_ONCE(!delta_time))goto rearm;do_div(cputime_speedadj, delta_time);//loadadjfreq(類似于積分)=（運行時間 -空閑時間）* policy->cur / 運行時間 * 100loadadjfreq = (unsigned int)cputime_speedadj * 100;//cpu_load=非idle的時間比例*當前頻率占目標頻率的比例cpu_load = loadadjfreq / pcpu->target_freq;//超頻是否允許boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

首先調用update_load，更新工作負載

static u64 update_load(int cpu) {struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);struct cpufreq_interactive_tunables *tunables =pcpu->policy->governor_data;u64 now;//現在的時間？？u64 now_idle;unsigned int delta_idle;unsigned int delta_time;u64 active_time;now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);//總空閑時間delta_idle = (unsigned int)(now_idle - pcpu->time_in_idle);//兩次空閑時間之間的時間delta_time = (unsigned int)(now - pcpu->time_in_idle_timestamp);//兩次統計之間系統運行的總時間？？if (delta_time <= delta_idle)active_time = 0;//說明運行期間CPU一直在idle，active_time賦值為0 elseactive_time = delta_time - delta_idle;//處于運行狀態的時間pcpu->cputime_speedadj += active_time * pcpu->policy->cur;//計算頻率的積分？？限制的一種？？pcpu->time_in_idle = now_idle;//總空閑時間pcpu->time_in_idle_timestamp = now;return now; }

now_idle:系統啟動以后運行的idle的總時間

the cummulative idle time (since boot) for a given CPU, in microseconds.

pcpu->time_in_idle：上次統計時的idle的總時間
delta_idle：兩次統計之間的idle總時間

now：本次計時的時間，本次的update time

variable to store update time in.

pcpu->time_in_idle_timestamp，上次統計idle時的時間戳
delta_time：兩次統計之間系統運行的總時間

若delta_time <= delta_idle，說明運行期間CPU一直在idle，active_time賦值為0.
否則，active_time = delta_time - delta_idle;計算出兩次統計之間CPU處于active的總時間.

然后更新pcpu的一些成員變量的值:
pcpu->cputime_speedadj 這個數值的計算方式是本身加上active_time * pcpu->policy->cur，是一共改變了多少頻率的意思？不確定.
pcpu->time_in_idle = now_idle; 更新系統啟動后運行的總idle時間.
pcpu->time_in_idle_timestamp = now;更新統計時的時間戳.
上面這兩個數值被更新留作下次update_load使用
回到cpufreq_interactive_timer，update_load返回了最新一次統計idle時的時間戳，賦值給now.

delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);

再次計算兩次統計之間的運行時間
在update_load中是now - pcpu->time_in_idle_timestamp，但是隨后在update_load更新了time_in_idle_timestamp的值，所以now和time_in_idle_timestamp應該相等，不能再這么算.
這里用cputime_speedadj_timestamp，我在函數cpufreq_interactive_timer_resched和cpufreq_interactive_timer_start發現cputime_speedadj_timestamp都被賦值為time_in_idle_timestamp，所以我認為：cputime_speedadj_timestamp是作為time_in_idle_timestamp的一個“備份”，保存上次統計時的time_in_idle_timestamp.

然后取pcpu->cputime_speedadj賦值給局部變量cputime_speedadj，pcpu->cputime_speedadj在update_load中已被計算并更新過了.

接下來的幾行代碼都是用來計算cpu_load，把這些數值展開看就變得很清晰了

loadadjfreq = (unsigned int)cputime_speedadj * 100;

替換后

cputime_speedadj = active_time * policy->cur * 100 cputime_speedadj = （delta_time - delta_idle）* policy->cur * 100 cputime_speedadj = [(now -time_in_idle_timestamp) - (now_idle - time_in_idle)] * policy->cur * 100

now -time_in_idle_timestamp是兩次統計間的運行時間，用x表示；
now_idle - time_in_idle是兩次統計間CPU處于idle的時間，用y表示。
之前分析過，cputime_speedadj_timestamp是time_in_idle_timestamp的備份，且
?

do_div(cputime_speedadj, delta_time);

所以，可以替換為

cputime_speedadj = cputime_speedadj / delta_time delta_time = now - pcpu->cputime_speedadj_timestamp delta_time = now - time_in_idle_timestamp

now -time_in_idle_timestamp是兩次統計間的運行時間，用x表示；
now_idle - time_in_idle是兩次統計間idle的總時間，用y表示
所以

cputime_speedadj = （x - y）* policy->cur / x * 100 cpu_load = [（x - y） / x ] * [ policy->cur / pcpu->target_freq] * 100 cpu_load = (1 - y / x) * ( policy->cur / pcpu->target_freq ) * 100

(1 - y / x)是統計時間內CPU處于非idle的時間比例，policy->cur / pcpu->target_freq 表示當前頻率占目標頻率的比例，至于為什么要乘以100，是因為內核不支持浮點運算，所以用百分比的整數來表示.

ok，到這里我們終于發現，影響cpu_load的兩個因素
1. idle時間
2. 當前頻率/目標頻率

有一個疑問：
cpufreq_interactive_timer函數的目的是為了根據當前的workload選頻，得到目標頻率，然后傳給cpufreq driver來設置頻率。如果已經有了目標頻率，那么直接調driver設置好了，所以這里的pcpu->target_freq不是本次選頻得到的target_freq
在cpufreq_interactive_timer的后面代碼中，我們看到
?

pcpu->target_freq = new_freq;

new_freq 是本次選頻后得到的新頻率，最后賦值給pcpu->target_freq，所以在cpufreq_interactive_timer中，該賦值語句之前的所有pcpu->target_freq都表示是上一次選頻的target_freq

所以更正一下，影響cpu_load的兩個因素

1. idle時間的比例
2. 當前頻率/上一次選頻頻率

OK，帶著這個思路就比較好分析了

if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {if (pcpu->target_freq < tunables->hispeed_freq) {new_freq = tunables->hispeed_freq;} else {new_freq = choose_freq(pcpu, loadadjfreq);if (new_freq < tunables->hispeed_freq)new_freq = tunables->hispeed_freq;}} else {new_freq = choose_freq(pcpu, loadadjfreq);if (new_freq > tunables->hispeed_freq &&pcpu->target_freq < tunables->hispeed_freq)new_freq = tunables->hispeed_freq;}

當cpu_load大于等于tunables->go_hispeed_load或者tunables->boosted的值為非0，此時我們需要拉高頻率.
如果上一次選頻頻率比tunables->hispeed_freq小，那么直接設置new_freq為tunables->hispeed_freq;
如果上一次選頻頻率不小于tunables->hispeed_freq，調用choose_freq函數選頻，若選頻后仍然達不到tunables->hispeed_freq，那么直接設置new_freq為tunables->hispeed_freq。
可以看到，當cpu_load達到tunables->go_hispeed_load時，new_freq的頻率要不小于tunables->hispeed_freq.

當cpu_load小于tunables->go_hispeed_load并且tunables->boosted的值為0，調用choose_freq選頻.
若選頻后new_freq的值大于tunables->hispeed_freq并且上一次選頻頻率小于tunables->hispeed_freq，那么直接設置new_freq為tunables->hispeed_freq.

關于choose_freq是如何選頻的，請看函數

freq的初始數值是pcpu->policy->cur
prevfreq保存著上次的freq，freq表示本次選頻的結果，當二者相等時，就表示已經達到了最佳的選頻結果.

順便說一下：

/* lowest frequency at or above target: pick the smallest value >= target */
#define CPUFREQ_RELATION_L 0
/* highest frequency below or at target: pick the largest value <= target */
#define CPUFREQ_RELATION_H 1

之后是一個循環：
把上次的freq賦值給prevfreq
通過freq_to_targetload得到target load——tl
然后調用cpufreq_frequency_table_target，取大于等于loadadjfreq / tl，即target freq的最小值.
loadadjfreq是兩次采樣統計間的平均頻率，除以target load就得到target freq.

freq = pcpu->freq_table[index].frequency;

我覺得情景應該是這樣的，剛開始freq = pcpu->policy->cur，備份到prevfreq，調用cpufreq_frequency_table_target得到新的freq，然后執行下面的if判斷，在這個判斷中可能會調整freq，先不管這些.
到了下一次循環，上一次的freq又被備份到prevfreq，然后又調用cpufreq_frequency_table_target得到新的freq，如此往復循環，prevfreq和freq的數值會越來越接近，直到相等，就完成了選頻.
總體思路是這樣，那么來看if判斷做了什么.
拿freq和prevfreq比較：
若freq > prevfreq，則
freqmin = prevfreq;
否則
freqmax = prevfreq;

如果freq > prevfreq，說明比上次大，但是不能比之前的記錄最大值大，否則調節就沒有意義了，所以
如果freq >= freqmax，那么調用cpufreq_frequency_table_target，找小于freqmax的最近一個頻點，如果該頻點正好是最小頻點，說明只有freqmax可以用了，直接break；

如果freq < prevfreq，說明比上次小，但是不能比之前記錄的最小值小，否則調節就沒有意義了，所以
如果freq <= freqmin，那么調用cpufreq_frequency_table_target，找大于freqmin的最近一個頻點，如果該頻點正好是最大頻點freqmax，直接break；
最后返回選好的頻點freq.

繼續探究cpufreq_interactive_timer

if (pcpu->target_freq >= tunables->hispeed_freq &&new_freq > pcpu->target_freq &&now - pcpu->hispeed_validate_time <freq_to_above_hispeed_delay(tunables, pcpu->target_freq)) {trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}

freq_to_above_hispeed_delay，只是返回了tunables->above_hispeed_delay[i]的數值，我們只設置了一個數值default_above_hispeed_delay.
重點是這個成員的含義，可以回頭看一下INIT階段的解釋.
如果滿足
?

pcpu->target_freq >= tunables->hispeed_freq &&new_freq > pcpu->target_freq

now是本次采樣時間戳，pcpu->hispeed_validate_time是上次hispeed生效的時間戳，如果兩次時間間隔比above_hispeed_delay小，那么直接goto rearm，不調節頻率.

pcpu->hispeed_validate_time = now;

更新hispeed_validate_time為now

if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,new_freq, CPUFREQ_RELATION_L,&index)) {spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}new_freq = pcpu->freq_table[index].frequency;

取freq table中大于或等于new_freq的最小頻率，返回index，再由index得到new freq，前面已經得到new freq了，這里為什么要再來一次，不是很理解.

/** Do not scale below floor_freq unless we have been at or above the* floor frequency for the minimum sample time since last validated.*/if (new_freq < pcpu->floor_freq) {if (now - pcpu->floor_validate_time <tunables->min_sample_time) {trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}}

當new_freq < pcpu->floor_freq，并且兩次floor_validate_time的間隔小于min_sample_time，此時不需要更新頻率.網上有大神說，“在最小抽樣周期間隔內，CPU的頻率是不會變化的.”

/** Update the timestamp for checking whether speed has been held at* or above the selected frequency for a minimum of min_sample_time,* if not boosted to hispeed_freq. If boosted to hispeed_freq then we* allow the speed to drop as soon as the boostpulse duration expires* (or the indefinite boost is turned off).*/if (!tunables->boosted || new_freq > tunables->hispeed_freq) {pcpu->floor_freq = new_freq;pcpu->floor_validate_time = now;}

做一些更新數據的工作

if (pcpu->target_freq == new_freq &&pcpu->target_freq <= pcpu->policy->cur) {trace_cpufreq_interactive_already(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm_if_notmax;} rearm_if_notmax:/** Already set max speed and don't see a need to change that,* wait until next idle to re-evaluate, don't need timer.*/if (pcpu->target_freq == pcpu->policy->max)goto exit;

如果兩次選頻頻率一樣并且上一次選頻頻率不大于當前頻率，那么進入rearm_if_notmax判斷是否pcpu->target_freq == pcpu->policy->max，如果相等，那么直接退出，不需要調頻，當前頻率已經處于max speed

pcpu->target_freq = new_freq;spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);spin_lock_irqsave(&speedchange_cpumask_lock, flags);cpumask_set_cpu(data, &speedchange_cpumask);spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);wake_up_process(speedchange_task);

將new_freq賦值給target_freq，更新目標頻率的數值.
設置需要調節頻率的CPUcore的cpumask
喚醒speedchange_task線程，改變CPU頻率
speedchange_task被定義在

/* realtime thread handles frequency scaling */
static struct task_struct *speedchange_task;

對應的線程是

speedchange_task =kthread_create(cpufreq_interactive_speedchange_task, NULL,"cfinteractive");

/*
 * Body of the "cfinteractive" kernel thread: sleeps until CPUs are flagged
 * in speedchange_cpumask, then asks the cpufreq driver to apply the
 * requested frequency for each flagged CPU. Returns 0 when the loop exits.
 */
static int cpufreq_interactive_speedchange_task(void *data)
{
	unsigned int cpu;
	cpumask_t tmp_mask;
	unsigned long flags;
	struct cpufreq_interactive_cpuinfo *pcpu;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock_irqsave(&speedchange_cpumask_lock, flags);

		/* Nothing pending: drop the lock and sleep until woken. */
		if (cpumask_empty(&speedchange_cpumask)) {
			spin_unlock_irqrestore(&speedchange_cpumask_lock,
					       flags);
			schedule();

			if (kthread_should_stop())
				break;

			spin_lock_irqsave(&speedchange_cpumask_lock, flags);
		}

		set_current_state(TASK_RUNNING);
		/* Snapshot and clear the pending set while holding the lock. */
		tmp_mask = speedchange_cpumask;
		cpumask_clear(&speedchange_cpumask);
		spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);

		for_each_cpu(cpu, &tmp_mask) {
			unsigned int j;
			unsigned int max_freq = 0;

			pcpu = &per_cpu(cpuinfo, cpu);
			/* Skip this CPU if enable_sem cannot be read-locked. */
			if (!down_read_trylock(&pcpu->enable_sem))
				continue;
			if (!pcpu->governor_enabled) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/* Use the highest target_freq among the policy's CPUs. */
			for_each_cpu(j, pcpu->policy->cpus) {
				struct cpufreq_interactive_cpuinfo *pjcpu =
					&per_cpu(cpuinfo, j);

				if (pjcpu->target_freq > max_freq)
					max_freq = pjcpu->target_freq;
			}

			/* Only call the driver when the frequency would change. */
			if (max_freq != pcpu->policy->cur)
				__cpufreq_driver_target(pcpu->policy,
							max_freq,
							CPUFREQ_RELATION_H);
			trace_cpufreq_interactive_setspeed(cpu,
						     pcpu->target_freq,
						     pcpu->policy->cur);

			up_read(&pcpu->enable_sem);
		}
	}

	return 0;
}

這個函數比較簡單，在一個while循環中，遍歷speedchange_cpumask相關的CPU，然后再次遍歷所有online CPU，得到最大的target_freq，將target_freq賦值給max_freq，即我們需要設置的CPU頻率.
若max_freq != pcpu->policy->cur,說明當前頻率不等于我們需要設置的頻率，調用__cpufreq_driver_target完成頻率設置.
__cpufreq_driver_target會調用對應的callback完成頻率設置，具體和cpufreq driver相關，需要driver工程師根據自己的平臺實現.

回顧一下之前的工作，我們分析了interactive governor的創建，初始化
如果CPUFREQ core想要啟用interactive governor，就要調用interactive governor提供的interface——.governor
在這個callback中，分析了governor在policy方面的初始化，start一個governor，然后調頻的工作就交給了定時器（定時器在start governor的時候被啟動）.
在定時器中，計算cpu_load，然后根據cpu_load來選頻，然后更新pcpu的一些數據，選頻得到的頻率交由CPUFREQ driver來設置到硬件中去.

接下去一個是：CPUFREQ_GOV_STOP

case CPUFREQ_GOV_STOP:mutex_lock(&gov_lock);for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);down_write(&pcpu->enable_sem);pcpu->governor_enabled = 0;del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);up_write(&pcpu->enable_sem);}mutex_unlock(&gov_lock);break;

遍歷所有online的cpu：
獲取cpuinfo
設置pcpu->governor_enabled為0
刪除兩個定時器

下一個是：CPUFREQ_GOV_LIMITS

case CPUFREQ_GOV_LIMITS:if (policy->max < policy->cur)__cpufreq_driver_target(policy,policy->max, CPUFREQ_RELATION_H);else if (policy->min > policy->cur)__cpufreq_driver_target(policy,policy->min, CPUFREQ_RELATION_L);for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);down_read(&pcpu->enable_sem);if (pcpu->governor_enabled == 0) {up_read(&pcpu->enable_sem);continue;}spin_lock_irqsave(&pcpu->target_freq_lock, flags);if (policy->max < pcpu->target_freq)pcpu->target_freq = policy->max;else if (policy->min > pcpu->target_freq)pcpu->target_freq = policy->min;spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);up_read(&pcpu->enable_sem);/* Reschedule timer only if policy->max is raised.* Delete the timers, else the timer callback may* return without re-arm the timer when failed* acquire the semaphore. This race may cause timer* stopped unexpectedly.*/if (policy->max > pcpu->max_freq) {down_write(&pcpu->enable_sem);del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);cpufreq_interactive_timer_start(tunables, j);up_write(&pcpu->enable_sem);}pcpu->max_freq = policy->max;}break;

這個我沒怎么看，應該是做一些限制，

該event被調用的場景是：change or update limits.
當policy的max或min被改變時，會調用cpufreq_update_policy—>cpufreq_set_policy—>__cpufreq_governor，在__cpufreq_governor中policy->governor->governor調用governor的governor callback
然后進入CPUFREQ_GOV_LIMITS
此時傳入cpufreq_governor_interactive的policy指針已經是min或max被改變后的新policy了
對于新policy的處理如下：
改變當前頻率，使其符合新policy的范圍
遍歷所有online CPU：
判斷pcpu->target_freq的值，確保其在新policy的范圍內
如果之前的policy->max，即pcpu->max_freq小于新的policy->max,那么刪除兩個定時器鏈表
調用cpufreq_interactive_timer_start,重新add定時器
將pcpu->max_freq的值更新為新policy的最大值
?

總結

以上是生活随笔為你收集整理的CPU中的DVFS的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

cpu
DVFS

上一篇： Linux命令行设置行数,Linux设置
下一篇：【转】performSelector延时