2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11 * all rights reserved.
13 * Author: Kyle C. Hale <kh@u.northwestern.edu>
14 * Shiva Rao <shiva.rao.717@gmail.com>
15 * Peter Dinda <pdinda@northwestern.edu>
17 * This is free software. you are permitted to use,
18 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
32 #include <asm/msr-index.h>
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
38 #include <interfaces/vmm_pstate_ctrl.h>
41 #include "iface-pstate-ctrl.h"
43 #include "linux-exts.h"
46 This P-STATE control implementation includes:
48 - Direct control of Intel and AMD processor pstates
49 - External control of processor states via Linux (unimplemented)
50 - Internal control of processor states in Palacios (handoff from Linux)
52 Additionally, it provides a user-space interface for manipulating
53 p-state regardless of the host's functionality. This includes
54 an ioctl for commanding the implementation and a /proc file for
55 showing current status and capabilities.
57 What we mean by "pstate" here is the processor's internal
58 configuration. For AMD, this is defined as being the same as
59 the ACPI-defined p-state. For Intel, it is not. There, it is the
60 contents of the perf ctl MSR, which, often, is the frequency id
61 and voltage id (the multipliers).
66 #define PALACIOS_GOVNAME "v3vee"
67 #define MAX_PATH_LEN 128
68 #define MAX_GOV_NAME_LEN 16
// Per-core p-state bookkeeping; one instance per CPU is declared after the
// struct as the per-CPU variable core_state.
// NOTE(review): not every member is visible in this listing. Code elsewhere in
// this file also uses mode, cur_pstate, min_pstate, max_pstate and
// have_cpufreq on this struct.
71 struct pstate_core_info {
72 // Here we have the notion of host control
73 #define V3_PSTATE_HOST_CONTROL 0
74 // and all the modes from the Palacios interface:
75 // V3_PSTATE_EXTERNAL_CONTROL
76 // V3_PSTATE_DIRECT_CONTROL
77 // V3_PSTATE_INTERNAL_CONTROL
80 // Apply if we are under the DIRECT state
// Last hardware pstate sampled for this core; refreshed by update_hw_pstate().
85 uint8_t cur_hw_pstate;
87 // Apply if we are under the EXTERNAL state
88 uint64_t cur_freq_khz;
89 uint64_t max_freq_khz;
90 uint64_t min_freq_khz;
// Bit 16 of MSR_MISC_ENABLE_IA32 as found by init_arch_intel(); written back
// by deinit_arch_intel().
93 uint8_t prior_speedstep;
// NOTE(review): in the visible code this is only mentioned in a disabled
// snippet inside set_pstate_intel() - confirm whether it is still needed.
94 uint8_t turbo_disabled;
99 // This is where we stash Linux's governor when we make a mode switch
100 char * linux_governor;
101 // We have this so we can restore the original frequency when we started
102 uint64_t original_hz;
// The per-CPU instance of the state described above.
107 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
111 // These are used to assert DIRECT control over the core pstates
// Per-architecture operations table. amd_funcs and intel_funcs are the two
// instances in this file; machine_state.funcs points at the one selected by
// pstate_arch_setup(), or is NULL when neither vendor matches.
112 struct pstate_core_funcs {
113 void (*arch_init)(void);
114 void (*arch_deinit)(void);
115 uint64_t (*get_min_pstate)(void);
116 uint64_t (*get_max_pstate)(void);
117 uint64_t (*get_pstate)(void);
118 void (*set_pstate)(uint64_t pstate);
// Machine-wide capability flags, filled in by supports_pstates_amd() or
// supports_pstates_intel(), plus the vendor-specific operations table.
// NOTE(review): this listing does not show every member; have_pstate,
// have_coreboost, have_feedback and have_speedstep are also written elsewhere
// in this file.
121 struct pstate_machine_info {
122 enum {INTEL, AMD, OTHER } arch;
// Return value of the vendor-specific supports_pstates_*() probe.
123 int supports_pstates;
133 int have_opportunistic; // this means "Turbo Boost" or "IDA"
134 int have_policy_hint;
135 int have_hwp; // hardware-controlled performance states
136 int have_hdc; // hardware duty cycling
137 int have_mwait_ext; // mwait power extensions
138 int have_mwait_int; // mwait wakes on interrupt
141 int have_pstate_hw_coord; // mperf/aperf
143 // used for DIRECT control
144 struct pstate_core_funcs *funcs;
// Single file-scope instance shared by all cores.
148 static struct pstate_machine_info machine_state;
151 /****************************************************
153 ***************************************************/
155 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
156 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
157 #define MSR_PSTATE_CTL_REG_AMD 0xc0010062
158 #define MSR_PSTATE_STAT_REG_AMD 0xc0010063
// Overlay for MSR_PSTATE_LIMIT_REG_AMD. get_max_pstate_amd() and
// get_min_pstate_amd() read the raw MSR into .val and then pick a field
// through .reg.
160 struct p_state_limit_reg_amd {
164 uint8_t pstate_limit : 4; /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
165 uint8_t pstate_max : 4; /* highest P-state value supported (lowest perf) */
168 } __attribute__((packed));
169 } __attribute__((packed));
// Overlay for MSR_PSTATE_STAT_REG_AMD. get_pstate_amd() reads the MSR into
// .val and returns .reg.pstate; the member declarations themselves are not
// visible in this listing.
172 struct p_state_stat_reg_amd {
179 } __attribute__((packed));
180 } __attribute__((packed));
// Overlay for MSR_PSTATE_CTL_REG_AMD. set_pstate_amd() writes its .val to the
// MSR; the member declarations themselves are not visible in this listing.
183 struct p_state_ctl_reg_amd {
190 } __attribute__((packed));
191 } __attribute__((packed));
194 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
// Probes AMD power-management features and records them in machine_state:
//   leaf 0x80000007 EDX bit 7  -> have_pstate
//   leaf 0x80000007 EDX bit 9  -> have_coreboost
//   leaf 0x80000007 EDX bit 11 -> have_feedback
//   leaf 0x6        ECX bit 0  -> have_pstate_hw_coord
// Logs the result and returns have_pstate (non-zero when hardware pstates
// are reported).
195 static uint8_t supports_pstates_amd (void)
197 uint32_t eax, ebx, ecx, edx;
199 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
200 machine_state.have_pstate = !!(edx & (1 << 7));
201 machine_state.have_coreboost = !!(edx & (1<<9));
202 machine_state.have_feedback = !!(edx & (1<<11));
204 cpuid(0x6, &eax, &ebx, &ecx, &edx);
205 machine_state.have_pstate_hw_coord = !!(ecx & 1);
207 INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
208 machine_state.have_pstate,
209 machine_state.have_coreboost,
210 machine_state.have_feedback,
211 machine_state.have_pstate_hw_coord);
213 return machine_state.have_pstate;
// arch_init hook for AMD (see amd_funcs); intentionally empty.
219 static void init_arch_amd(void)
221 /* KCH: nothing to do here */
// arch_deinit hook for AMD (see amd_funcs); intentionally empty.
225 static void deinit_arch_amd(void)
227 /* KCH: nothing to do here */
// Reads MSR_PSTATE_STAT_REG_AMD on the calling CPU, caches the pstate field
// in this CPU's core_state.cur_pstate, and returns it.
231 static uint64_t get_pstate_amd(void)
233 struct p_state_stat_reg_amd pstat;
235 rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
237 get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
238 put_cpu_var(core_state);
240 return pstat.reg.pstate;
// Writes a control value to MSR_PSTATE_CTL_REG_AMD on the calling CPU and
// records p as this CPU's core_state.cur_pstate.
// NOTE(review): the statements that fill pctl from p are not visible in this
// listing - confirm pctl is fully initialized before the wrmsrl.
244 static void set_pstate_amd(uint64_t p)
246 struct p_state_ctl_reg_amd pctl;
250 wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
252 get_cpu_var(core_state).cur_pstate=p;
253 put_cpu_var(core_state);
258 * NOTE: HW may change this value at runtime
// Returns the pstate_max field of MSR_PSTATE_LIMIT_REG_AMD, i.e. the highest
// P-state number (lowest performance) per the field comment on the struct.
260 static uint64_t get_max_pstate_amd(void)
262 struct p_state_limit_reg_amd plimits;
264 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
266 return plimits.reg.pstate_max;
// Returns the pstate_limit field of MSR_PSTATE_LIMIT_REG_AMD, i.e. the lowest
// P-state number (highest performance) currently allowed per the field
// comment on the struct.
270 static uint64_t get_min_pstate_amd(void)
272 struct p_state_limit_reg_amd plimits;
274 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
276 return plimits.reg.pstate_limit;
// AMD operations table; installed as machine_state.funcs by
// pstate_arch_setup() when the vendor check for AMD succeeds.
280 static struct pstate_core_funcs amd_funcs =
282 .arch_init = init_arch_amd,
283 .arch_deinit = deinit_arch_amd,
284 .get_pstate = get_pstate_amd,
285 .set_pstate = set_pstate_amd,
286 .get_max_pstate = get_max_pstate_amd,
287 .get_min_pstate = get_min_pstate_amd,
292 /***********************************************************
294 **********************************************************/
298 This implementation uses SpeedStep, but does check
299 to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
303 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
304 #define MSR_MPERF_IA32 0x000000e7
305 #define MSR_APERF_IA32 0x000000e8
306 #define MSR_MISC_ENABLE_IA32 0x000001a0
307 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
308 #define MSR_PLATFORM_INFO_IA32 0x000000ce
309 #define MSR_PERF_CTL_IA32 0x00000199
310 #define MSR_PERF_STAT_IA32 0x00000198
311 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
314 /* Note that the actual meaning of the pstate
315 in the control and status registers is actually
316 implementation dependent, unlike AMD. The "official"
317 way to figure out the mapping from pstate to
318 these values is via ACPI. What is written in the register
319 is an "id" of an operating point
321 "Often", the 16 bit field consists of a high order byte
322 which is the frequency (the multiplier) and the low order
325 // MSR_PERF_CTL_IA32 r/w
// Bit-field view of the performance control MSR. The four visible fields add
// up to 64 bits (16 + 16 + 1 + 31).
// NOTE(review): set_pstate_intel() works on a plain 64-bit value in its
// visible lines rather than through this overlay.
326 struct perf_ctl_reg_intel {
330 // This is the target
331 // Note, not the ACPI pstate, but
332 // Intel's notion of pstate is that it's opaque
333 // for lots of implementations it seems to be
334 // frequency_id : voltage_id
335 // where frequency_id is typically the multiplier
336 uint16_t pstate : 16;
337 uint16_t reserved : 16;
338 // set to 1 to *disengage* dynamic acceleration
339 // Note that "IDA" and "Turbo" use the same interface
340 uint16_t dynamic_accel_disable : 1;
341 uint32_t reserved2 : 31;
343 } __attribute__((packed));
344 } __attribute__((packed));
346 // MSR_PERF_STAT_IA32 r
// Bit-field view of the performance status MSR: a 16-bit current pstate id
// followed by 48 reserved bits.
347 struct perf_stat_reg_intel {
351 // this is the current
352 uint16_t pstate : 16;
353 uint64_t reserved : 48;
355 } __attribute__((packed));
356 } __attribute__((packed));
358 // MSR_ENERGY_PERF_BIAS_IA32 r/w
// Bit-field view of the energy/performance bias MSR: a 4-bit policy hint
// followed by 60 reserved bits.
// NOTE(review): the struct tag and the MSR_ENERY_PERF_BIAS_IA32 macro both
// spell "energy" as "enery"; renaming needs a check of all users.
359 struct enery_perf_bias_reg_intel {
363 // this is the current
364 uint8_t policy_hint : 4;
365 uint64_t reserved : 60;
367 } __attribute__((packed));
368 } __attribute__((packed));
// Bit-field view of a 64-bit MSR carrying ratio limits.
// NOTE(review): presumably this overlays MSR_PLATFORM_INFO_IA32 - confirm.
// The reserved/padding members between the visible fields are not shown in
// this listing, so bit positions cannot be verified from here.
371 struct turbo_mode_info_reg_intel {
376 uint8_t max_noturbo_ratio : 8;
378 uint8_t ppin_cap : 1;
380 uint8_t ratio_limit : 1;
381 uint8_t tdc_tdp_limit : 1;
383 uint8_t min_ratio : 8;
386 } __attribute__((packed));
387 } __attribute__((packed));
389 // This replicates the critical information in Linux's struct acpi_processor_px
390 // To make it easier to port to other OSes.
// One entry per pstate index: supports_pstates_intel() fills freq from the
// ACPI core_frequency multiplied by 1000 and ctrl from the ACPI control value.
391 struct intel_pstate_info {
392 uint64_t freq; // KHz
393 uint64_t ctrl; // What to write into the _CTL MSR to get this
396 // The internal array will be used if we cannot build the table locally
// NOTE(review): in the visible code the internal table starts NULL with zero
// entries and nothing assigns it, so the no-ACPI fallback in
// supports_pstates_intel() yields an empty mapping. set_pstate_intel() and
// get_max_pstate_intel() both test intel_num_pstates==0 first.
397 static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0;
398 static int intel_num_pstates_internal=0;
400 // These will either point to the internal array or to a constructed array
401 static struct intel_pstate_info *intel_pstate_to_ctrl=0;
402 static int intel_num_pstates=0;
405 /* CPUID.01:ECX.EIST(7) */
// Probes Intel power-management features, records them in machine_state and
// logs them:
//   leaf 0x1 ECX bit 7          -> have_speedstep
//   leaf 0x6 ECX bit 0 / bit 3  -> have_pstate_hw_coord / have_policy_hint
//   leaf 0x6 EAX bits 1, 7, 13  -> have_opportunistic / have_hwp / have_hdc
//   leaf 0x5 ECX bits 0, 1      -> have_mwait_ext / have_mwait_int
// When SpeedStep is present it also builds the pstate-index to control-value
// table from the ACPI performance states of the current CPU, or falls back to
// the internal table when no ACPI data is attached.
// Returns have_speedstep.
// NOTE(review): get_cpu_var() disables preemption on every call; the visible
// lines call get_cpu_var(processors) several times (twice in the condition,
// then inside the copy loop) against a single put_cpu_var() per path -
// confirm the preemption count is balanced.
// NOTE(review): the handling after a failed palacios_alloc() is not visible;
// confirm intel_num_pstates is reset so later indexing cannot hit NULL.
406 static uint8_t supports_pstates_intel(void)
408 /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
410 uint32_t eax, ebx, ecx, edx;
412 cpuid(0x1, &eax, &ebx, &ecx, &edx);
413 machine_state.have_speedstep = !!(ecx & (1 << 7));
415 cpuid(0x6, &eax, &ebx, &ecx, &edx);
416 machine_state.have_pstate_hw_coord = !!(ecx & 1); // ?
417 machine_state.have_opportunistic = !!(eax & 1<<1);
418 machine_state.have_policy_hint = !!(ecx & 1<<3);
419 machine_state.have_hwp = !!(eax & 1<<7);
420 machine_state.have_hdc = !!(eax & 1<<13);
422 cpuid(0x5, &eax, &ebx, &ecx, &edx);
423 machine_state.have_mwait_ext = !!(ecx & 1);
424 machine_state.have_mwait_int = !!(ecx & 1<<1);
427 INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
428 machine_state.have_speedstep,
429 machine_state.have_pstate_hw_coord,
430 machine_state.have_opportunistic,
431 machine_state.have_policy_hint,
432 machine_state.have_hwp,
433 machine_state.have_hdc,
434 machine_state.have_mwait_ext,
435 machine_state.have_mwait_int );
438 if (machine_state.have_speedstep) {
440 // Build mapping table (from "pstate" (0..) to ctrl value for MSR
441 if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) {
442 put_cpu_var(processors);
443 // no acpi... revert to internal table
444 intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
445 intel_num_pstates=intel_num_pstates_internal;
447 intel_num_pstates = get_cpu_var(processors)->performance->state_count;
448 if (intel_num_pstates) {
449 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
450 if (!intel_pstate_to_ctrl) {
451 ERROR("P-State: Cannot allocate space for mapping...\n");
454 for (i=0;i<intel_num_pstates;i++) {
455 intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
456 intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
460 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
463 put_cpu_var(processors);
464 INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
465 for (i=0;i<intel_num_pstates;i++) {
466 INFO("P-State: Intel Mapping %u: freq=%llu ctrl=%llx\n",
467 i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
470 INFO("P-State: Intel: No speedstep here\n");
474 return machine_state.have_speedstep;
// arch_init hook for Intel: reads MSR_MISC_ENABLE_IA32, saves bit 16 (the
// SpeedStep enable, per the comments below) in this CPU's
// core_state.prior_speedstep, then writes the MSR back.
// NOTE(review): the statement that sets the enable bit in val before the
// wrmsrl is not visible in this listing.
478 static void init_arch_intel(void)
482 rdmsrl(MSR_MISC_ENABLE_IA32, val);
484 //INFO("P-State: prior ENABLE=%llx\n",val);
486 // store prior speedstep setting
487 get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
488 put_cpu_var(core_state);
490 // enable speedstep (probably already on)
492 wrmsrl(MSR_MISC_ENABLE_IA32, val);
494 //INFO("P-State: write ENABLE=%llx\n",val);
// arch_deinit hook for Intel: restores bit 16 of MSR_MISC_ENABLE_IA32 to the
// value saved by init_arch_intel(), leaving all other bits as read.
498 static void deinit_arch_intel(void)
502 rdmsrl(MSR_MISC_ENABLE_IA32, val);
504 //INFO("P-State: deinit: ENABLE=%llx\n",val);
// Clear the bit first, then OR the saved 0/1 back in.
506 val &= ~(1ULL << 16);
507 val |= get_cpu_var(core_state).prior_speedstep << 16;
508 put_cpu_var(core_state);
510 wrmsrl(MSR_MISC_ENABLE_IA32, val);
512 //INFO("P-state: deinit ENABLE=%llx\n",val);
516 /* TODO: Intel P-states require sampling at intervals... */
// Reads MSR_PERF_STAT_IA32 on the calling CPU.
// NOTE(review): the return statement is not visible in this listing, so the
// masking applied to val (if any) cannot be confirmed from here.
517 static uint64_t get_pstate_intel(void)
521 rdmsrl(MSR_PERF_STAT_IA32,val);
523 //INFO("P-State: Get: 0x%llx\n", val);
525 // should check if turbo is active, in which case
526 // this value is not the whole story
// Sets pstate index p on the calling CPU. The index is clamped to the last
// table entry, translated to a control value through intel_pstate_to_ctrl,
// and the low 16 bits of that value are merged into MSR_PERF_CTL_IA32. The
// index is then cached in this CPU's core_state.cur_pstate.
// NOTE(review): the body of the intel_num_pstates==0 branch and the statement
// that clears the old low 16 bits of val before the OR are not visible here.
531 static void set_pstate_intel(uint64_t p)
536 if (intel_num_pstates==0) {
539 if (p>=intel_num_pstates) {
540 p=intel_num_pstates-1;
544 ctrl=intel_pstate_to_ctrl[p].ctrl;
546 /* ...Intel IDA (dynamic acceleration)
547 if (c->no_turbo && !c->turbo_disabled) {
551 // leave all bits alone except for the likely
554 rdmsrl(MSR_PERF_CTL_IA32, val);
555 INFO("P-State: Pre-Set: 0x%llx\n", val);
558 val |= ctrl & 0xffffULL;
560 INFO("P-State: Set: 0x%llx\n", val);
562 wrmsrl(MSR_PERF_CTL_IA32, val);
564 get_cpu_var(core_state).cur_pstate = p;
565 put_cpu_var(core_state);
// get_min_pstate hook for Intel (see intel_funcs).
// NOTE(review): the body is not visible in this listing; presumably it
// returns index 0 of the mapping table - confirm.
569 static uint64_t get_min_pstate_intel(void)
// get_max_pstate hook for Intel: the largest valid index into the mapping
// table, intel_num_pstates-1. The empty-table case is handled separately; its
// return value is not visible in this listing.
576 static uint64_t get_max_pstate_intel (void)
578 if (intel_num_pstates==0) {
581 return intel_num_pstates-1;
// Intel operations table; installed as machine_state.funcs by
// pstate_arch_setup() when is_intel() succeeds.
585 static struct pstate_core_funcs intel_funcs =
587 .arch_init = init_arch_intel,
588 .arch_deinit = deinit_arch_intel,
589 .get_pstate = get_pstate_intel,
590 .set_pstate = set_pstate_intel,
591 .get_max_pstate = get_max_pstate_intel,
592 .get_min_pstate = get_min_pstate_intel,
597 /***********************************************
598 Arch determination and setup
599 ***********************************************/
// Inline-asm helper whose output constraints store eax, ebx, ecx and edx into
// dest[0], dest[1], dest[2] and dest[3] respectively.
// NOTE(review): the instruction template and input constraints are not
// visible in this listing; presumably it executes cpuid with eax = id.
601 static inline void cpuid_string (uint32_t id, uint32_t dest[4])
604 :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
// Fills name with the 12-byte vendor id returned for CPUID leaf 0, copying
// the registers in the order ebx, edx, ecx (dest[1], dest[3], dest[2]).
// NOTE(review): the NUL terminator for name[12] and the return value are not
// visible in this listing; is_intel() and is_amd() pass the buffer to strcmp,
// so termination is required.
609 static int get_cpu_vendor (char name[13])
614 cpuid_string(0,dest);
616 ((uint32_t*)name)[0]=dest[1];
617 ((uint32_t*)name)[1]=dest[3];
618 ((uint32_t*)name)[2]=dest[2];
// Returns non-zero when the CPU vendor string equals "GenuineIntel".
625 static int is_intel (void)
628 get_cpu_vendor(name);
629 return !strcmp(name,"GenuineIntel");
// Returns non-zero when the CPU vendor string equals "AuthenticAMD".
633 static int is_amd (void)
636 get_cpu_vendor(name);
637 return !strcmp(name,"AuthenticAMD");
// Selects the vendor-specific backend: sets machine_state.arch and
// machine_state.funcs, and stores the result of the vendor probe in
// machine_state.supports_pstates. For any other vendor, funcs is NULL and
// supports_pstates is 0.
// NOTE(review): the first condition (presumably is_amd()) and the return
// value are not visible in this listing.
640 static int pstate_arch_setup(void)
644 machine_state.arch = AMD;
645 machine_state.funcs = &amd_funcs;
646 machine_state.supports_pstates = supports_pstates_amd();
647 INFO("PSTATE: P-State initialized for AMD\n");
648 } else if (is_intel()) {
649 machine_state.arch = INTEL;
650 machine_state.funcs = &intel_funcs;
651 machine_state.supports_pstates = supports_pstates_intel();
652 INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
656 machine_state.arch = OTHER;
657 machine_state.funcs = NULL;
658 machine_state.supports_pstates = 0;
659 INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
668 /******************************************************************
670 *****************************************************************/
675 * This stub governor is simply a placeholder for preventing
676 * frequency changes from the Linux side. For now, we simply leave
677 * the frequency as is when we acquire control.
// Callback installed as stub_governor.governor. START, STOP and LIMITS events
// share one case group; any other event is logged as undefined.
// NOTE(review): the switch statement, the case bodies and the return values
// are not visible in this listing.
679 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
683 /* we can't use cpufreq_driver_target here as it can result
684 * in a circular dependency, so we'll just do nothing.
686 case CPUFREQ_GOV_START:
687 case CPUFREQ_GOV_STOP:
688 case CPUFREQ_GOV_LIMITS:
692 ERROR("Undefined governor command\n");
// cpufreq governor descriptor registered under the name PALACIOS_GOVNAME
// ("v3vee"); governor_switch() selects it by that name through sysfs.
700 static struct cpufreq_governor stub_governor =
702 .name = PALACIOS_GOVNAME,
703 .governor = governor_run,
704 .owner = THIS_MODULE,
// Workqueue used to defer frequency changes; created in pstate_linux_init()
// and destroyed in pstate_linux_deinit().
708 static struct workqueue_struct *pstate_wq;
// Member of the pstate_work_t work item (the rest of the typedef is not
// visible in this listing; users also access a freq member).
// NOTE(review): callers cast pstate_work_t pointers to struct work_struct
// pointers and back, which presumably relies on this being the first member.
712 struct work_struct work;
// Registers stub_governor with cpufreq.
// NOTE(review): the return value of cpufreq_register_governor() is dropped,
// so a failed registration is not reported to the caller.
716 static inline void pstate_register_linux_governor(void)
718 cpufreq_register_governor(&stub_governor);
// Unregisters stub_governor from cpufreq.
722 static inline void pstate_unregister_linux_governor(void)
724 cpufreq_unregister_governor(&stub_governor);
// Registers the stub governor and creates the "v3vee_pstate_wq" workqueue.
// The visible lines also log an error for a failed workqueue creation and
// unregister the governor again, presumably on that failure path.
// NOTE(review): the condition and return statements are not visible here.
728 static int pstate_linux_init(void)
730 pstate_register_linux_governor();
731 pstate_wq = create_workqueue("v3vee_pstate_wq");
733 ERROR("Could not create work queue\n");
740 pstate_unregister_linux_governor();
// Reverses pstate_linux_init(): unregisters the stub governor, then flushes
// and destroys the workqueue.
745 static void pstate_linux_deinit(void)
747 pstate_unregister_linux_governor();
748 flush_workqueue(pstate_wq);
749 destroy_workqueue(pstate_wq);
// Looks up the cpufreq policy of `cpu`, copies the active governor's name
// into a freshly allocated MAX_GOV_NAME_LEN buffer, and stores that buffer in
// the calling CPU's core_state.linux_governor. The temporary policy copy is
// freed on both visible exit paths.
// NOTE(review): the assignment to *buf and the return values are not visible
// in this listing.
// NOTE(review): strncpy does not NUL-terminate when the source fills the
// whole buffer - confirm governor names are always shorter than
// MAX_GOV_NAME_LEN.
// NOTE(review): policy->governor is dereferenced with no visible NULL check.
753 static int get_current_governor(char **buf, unsigned int cpu)
755 struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
756 char * govname = NULL;
759 ERROR("could not allocate cpufreq_policy\n");
763 if (cpufreq_get_policy(policy, cpu) != 0) {
764 ERROR("Could not get current cpufreq policy\n");
768 /* We're in interrupt context, should probably not wait here */
769 govname = palacios_alloc(MAX_GOV_NAME_LEN);
771 ERROR("Could not allocate space for governor name\n");
775 strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
777 get_cpu_var(core_state).linux_governor = govname;
778 put_cpu_var(core_state);
782 palacios_free(policy);
787 palacios_free(policy);
792 /* passed to the userspacehelper interface for cleanup */
// Frees the third argv entry and then the argv array itself. It is the
// cleanup callback that governor_switch() hands to call_usermodehelper_fns.
793 static void gov_switch_cleanup(struct subprocess_info * s)
795 palacios_free(s->argv[2]);
796 palacios_free(s->argv);
802 * @s - the governor to switch to
803 * TODO: this should probably be submitted to a work queue
804 * so we don't have to run it in interrupt context
// Changes the cpufreq governor of `cpu` by spawning /bin/sh through the
// usermode helper without waiting (UMH_NO_WAIT). The shell command echoes `s`
// into that CPU's scaling_governor sysfs file. Returns the result of
// call_usermodehelper_fns(); gov_switch_cleanup() is the cleanup callback.
// NOTE(review): the statements that fill argv are not visible in this listing.
// NOTE(review): `s` is interpolated into a shell command line unescaped. The
// visible callers pass PALACIOS_GOVNAME or a name saved from cpufreq - confirm
// no other source can reach here.
806 static int governor_switch(char * s, unsigned int cpu)
808 char * path_str = NULL;
811 static char * envp[] = {
814 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
817 argv = palacios_alloc(4*sizeof(char*));
819 ERROR("Couldn't allocate argv struct\n");
823 path_str = palacios_alloc(MAX_PATH_LEN);
825 ERROR("Couldn't allocate path string\n");
828 memset(path_str, 0, MAX_PATH_LEN);
830 snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
837 /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
838 return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
// Frees the governor-name buffer saved in the calling CPU's core_state.
// NOTE(review): the pointer is not reset to NULL in the visible lines.
846 static inline void free_linux_governor(void)
848 palacios_free(get_cpu_var(core_state).linux_governor);
849 put_cpu_var(core_state);
// Saves the calling CPU's current Linux governor name in core_state and then
// switches that CPU to the Palacios stub governor.
// NOTE(review): get_cpu() disables preemption; the matching put_cpu() and the
// return statements are not visible in this listing.
853 static int linux_setup_palacios_governor(void)
856 unsigned int cpu = get_cpu();
858 /* KCH: we assume the v3vee governor is already
859 * registered with kernel by this point
862 if (get_current_governor(&gov, cpu) < 0) {
863 ERROR("Could not get current governor\n");
867 DEBUG("saving current governor (%s)\n", gov);
869 get_cpu_var(core_state).linux_governor = gov;
870 put_cpu_var(core_state);
872 DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
874 /* set the new one to ours */
875 if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
876 ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
// Walks the cpufreq frequency table of a CPU, skipping invalid entries, and
// looks for the entry whose frequency equals the policy's current frequency.
// NOTE(review): the declaration of cpu, the counter updates and the return
// value are not visible; presumably the result is the index among valid
// entries.
// NOTE(review): unlike linux_get_freq(), the result of cpufreq_get_policy()
// is ignored here, and no NULL check on table is visible.
885 static int linux_get_pstate(void)
887 struct cpufreq_policy * policy = NULL;
888 struct cpufreq_frequency_table *table;
891 unsigned int count = 0;
894 policy = palacios_alloc(sizeof(struct cpufreq_policy));
896 ERROR("Could not allocate policy struct\n");
900 cpufreq_get_policy(policy, cpu);
901 table = cpufreq_frequency_get_table(cpu);
903 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
905 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
909 if (table[i].frequency == policy->cur) {
916 palacios_free(policy);
// Fetches the cpufreq policy of a CPU into a temporary allocation.
// NOTE(review): the declaration of cpu, the returned value and the free of
// policy are not visible in this listing; presumably it returns policy->cur.
923 static int linux_get_freq(void)
925 struct cpufreq_policy * policy = NULL;
928 policy = palacios_alloc(sizeof(struct cpufreq_policy));
930 ERROR("Could not allocate policy struct\n");
934 if (cpufreq_get_policy(policy, cpu)) {
935 ERROR("Could not get current policy\n");
// Workqueue handler queued by linux_set_pstate() and linux_set_freq(). It
// fetches the cpufreq policy of a CPU, asks the cpufreq driver for the
// frequency carried in the work item (CPUFREQ_RELATION_H), and records that
// frequency in the executing CPU's core_state.cur_freq_khz.
// NOTE(review): the return type line and the declaration of cpu are not
// visible in this listing.
// NOTE(review): the result of cpufreq_driver_target() is ignored, so
// cur_freq_khz is updated even if the request failed.
943 pstate_switch_workfn (struct work_struct *work)
945 pstate_work_t * pwork = (pstate_work_t*)work;
946 struct cpufreq_policy * policy = NULL;
950 policy = palacios_alloc(sizeof(struct cpufreq_policy));
952 ERROR("Could not allocate space for cpufreq policy\n");
956 if (cpufreq_get_policy(policy, cpu) != 0) {
957 ERROR("Could not get cpufreq policy\n");
961 INFO("P-state: setting frequency on core %u to %llu\n", cpu, pwork->freq);
962 cpufreq_driver_target(policy, pwork->freq, CPUFREQ_RELATION_H);
964 get_cpu_var(core_state).cur_freq_khz = pwork->freq;
965 put_cpu_var(core_state);
968 palacios_free(policy);
// Maps pstate number p onto the cpufreq frequency table of a CPU and queues
// a work item on pstate_wq that requests the matching frequency. Invalid
// table entries are skipped; when p is beyond the last valid entry the last
// valid frequency is used instead.
// NOTE(review): the match condition, counter updates, declaration of cpu,
// labels and return statements are not visible in this listing.
// NOTE(review): no NULL check on table is visible.
974 static int linux_set_pstate(uint8_t p)
976 struct cpufreq_policy * policy = NULL;
977 struct cpufreq_frequency_table *table;
978 pstate_work_t * work = NULL;
981 unsigned int count = 0;
986 policy = palacios_alloc(sizeof(struct cpufreq_policy));
988 ERROR("Could not allocate policy struct\n");
992 work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
994 ERROR("Could not allocate work struct\n");
998 if (cpufreq_get_policy(policy, cpu)) {
999 ERROR("Could not get current policy\n");
1002 table = cpufreq_frequency_get_table(cpu);
1004 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1006 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1012 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1013 work->freq = table[i].frequency;
1014 queue_work(pstate_wq, (struct work_struct*)work);
1024 /* we need to deal with the case in which we get a number > max pstate */
1026 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1027 work->freq = table[last_valid].frequency;
1028 queue_work(pstate_wq, (struct work_struct*)work);
1031 palacios_free(policy);
1035 palacios_free(work);
1037 palacios_free(policy);
// Queues a work item on pstate_wq that requests frequency f for the current
// CPU. The visible comparisons test f against the policy's min and max.
// NOTE(review): the clamped assignments, the store to work->freq and the
// return statements are not visible in this listing.
// NOTE(review): get_cpu() disables preemption; no matching put_cpu() is
// visible.
1042 static int linux_set_freq(uint64_t f)
1044 struct cpufreq_policy * policy = NULL;
1045 pstate_work_t * work = NULL;
1047 int cpu = get_cpu();
1050 policy = palacios_alloc(sizeof(struct cpufreq_policy));
1052 ERROR("Could not allocate policy struct\n");
1056 work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1058 ERROR("Could not allocate work struct\n");
1062 if (cpufreq_get_policy(policy, cpu) != 0) {
1063 ERROR("Could not get cpufreq policy\n");
1067 if (f < policy->min) {
1069 } else if (f > policy->max) {
1075 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1077 queue_work(pstate_wq, (struct work_struct*)work);
1079 palacios_free(policy);
1083 palacios_free(work);
1085 palacios_free(policy);
// Switches the current CPU back to the governor name saved in core_state and
// frees the saved name on both visible exit paths.
// NOTE(review): governor_switch() copies the name into its own buffer before
// returning, so freeing it afterwards looks safe; the error message on
// failure still prints gov before the free.
// NOTE(review): get_cpu() disables preemption; no matching put_cpu() is
// visible, nor is a NULL check on gov.
1090 static int linux_restore_defaults(void)
1092 unsigned int cpu = get_cpu();
1095 gov = get_cpu_var(core_state).linux_governor;
1096 put_cpu_var(core_state);
1098 DEBUG("restoring previous governor (%s)\n", gov);
1100 if (governor_switch(gov, cpu) < 0) {
1101 ERROR("Could not restore governor to (%s)\n", gov);
1105 free_linux_governor();
1109 free_linux_governor();
1115 /******************************************************************
1116 Generic Interface as provided to Palacios and to the rest of the
1118 ******************************************************************/
// Initializes the calling CPU's core_state: mode is set to host control, the
// min/max pstate come from the vendor backend (or 0 without one), and the
// cpufreq limits come from the CPU's cpufreq policy (or 0 without one). It
// then logs the ACPI performance states of the CPU.
// NOTE(review): get_cpu_var() disables preemption on each call; the visible
// lines make many get_cpu_var(core_state) calls against one put_cpu_var(),
// and likewise for get_cpu_var(processors) in the loop - confirm the
// preemption count is balanced.
// NOTE(review): cpufreq_cpu_put(p) follows the closing brace of the last
// branch on the same line, so it appears to run even when p is NULL - confirm.
// NOTE(review): the logging loop dereferences
// get_cpu_var(processors)->performance with no visible NULL check, unlike
// supports_pstates_intel().
1120 static void init_core(void)
1123 struct cpufreq_policy *p;
1127 DEBUG("P-State Core Init\n");
1129 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1130 get_cpu_var(core_state).cur_pstate = 0;
1132 if (machine_state.funcs) {
1133 get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1134 get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1136 get_cpu_var(core_state).min_pstate = 0;
1137 get_cpu_var(core_state).max_pstate = 0;
// Only the CPU number is needed here; preemption is re-enabled right away.
1141 cpu = get_cpu(); put_cpu();
1143 p = cpufreq_cpu_get(cpu);
1146 get_cpu_var(core_state).have_cpufreq = 0;
1147 get_cpu_var(core_state).min_freq_khz=0;
1148 get_cpu_var(core_state).max_freq_khz=0;
1149 get_cpu_var(core_state).cur_freq_khz=0;
1151 get_cpu_var(core_state).have_cpufreq = 1;
1152 get_cpu_var(core_state).min_freq_khz=p->min;
1153 get_cpu_var(core_state).max_freq_khz=p->max;
1154 get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p);
1155 put_cpu_var(core_state);
1157 for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) {
1158 INFO("P-State: %u: freq=%llu ctrl=%llx",
1160 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1161 get_cpu_var(processors)->performance->states[i].control);
1163 put_cpu_var(processors);
// Forward declaration; the definition appears later in this file.
1167 void palacios_pstate_ctrl_release(void);
// Per-core teardown: hands control of the calling CPU back to the host via
// palacios_pstate_ctrl_release().
1170 static void deinit_core(void)
1172 DEBUG("P-State Core Deinit\n");
1173 palacios_pstate_ctrl_release();
// Fills *c with the calling CPU's p-state characteristics. Internal control
// is always advertised, external control only when cpufreq is available, and
// direct control only on AMD or Intel. The current mode and the pstate and
// frequency ranges are copied from core_state.
// NOTE(review): direct control is advertised from the vendor alone, while
// pstate_show() gates it on machine_state.supports_pstates - confirm which
// is intended.
// NOTE(review): eight get_cpu_var() calls are visible against a single
// put_cpu_var(); each get disables preemption - confirm the count balances.
1179 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c)
1181 memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1184 c->features = V3_PSTATE_INTERNAL_CONTROL;
1186 if (get_cpu_var(core_state).have_cpufreq) {
1187 c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1190 if (machine_state.arch==AMD || machine_state.arch==INTEL) {
1191 c->features |= V3_PSTATE_DIRECT_CONTROL;
1193 c->cur_mode = get_cpu_var(core_state).mode;
1194 c->min_pstate = get_cpu_var(core_state).min_pstate;
1195 c->max_pstate = get_cpu_var(core_state).max_pstate;
1196 c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1197 c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1198 c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1199 c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1201 put_cpu_var(core_state);
// Returns the current pstate of the calling CPU: from the vendor backend in
// direct mode, from the cpufreq table in external mode. The value returned in
// the remaining modes is not visible in this listing.
// NOTE(review): when the first test fails, a second get_cpu_var() is issued
// before any put_cpu_var(), so the non-direct paths show two gets for one
// put - confirm the preemption count balances.
1208 uint64_t palacios_pstate_ctrl_get_pstate(void)
1210 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1211 put_cpu_var(core_state);
1212 return machine_state.funcs->get_pstate();
1213 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1214 put_cpu_var(core_state);
1215 return linux_get_pstate();
1217 put_cpu_var(core_state);
// Sets pstate p on the calling CPU: through the vendor backend in direct
// mode, through cpufreq in external mode. No call is visible for other modes.
// NOTE(review): linux_set_pstate() takes a uint8_t, so p is truncated on the
// external path.
// NOTE(review): when the first test fails, a second get_cpu_var() is issued
// before any put_cpu_var(), so the non-direct paths show two gets for one
// put - confirm the preemption count balances.
1223 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1225 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1226 put_cpu_var(core_state);
1227 machine_state.funcs->set_pstate(p);
1228 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1229 put_cpu_var(core_state);
1230 linux_set_pstate(p);
1232 put_cpu_var(core_state);
// Adapter with a void-pointer argument: the pointer value itself carries the
// pstate. The value is narrowed to 8 bits before being passed on, so only
// pstates 0..255 survive.
1237 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1239 palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
// Returns linux_get_freq() when the calling CPU is in external mode. The
// value returned in other modes is not visible in this listing.
1243 uint64_t palacios_pstate_ctrl_get_freq(void)
1245 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1246 put_cpu_var(core_state);
1247 return linux_get_freq();
1249 put_cpu_var(core_state);
// Frequency setter that only acts when the calling CPU is in external mode.
// NOTE(review): the call made in that branch is not visible in this listing;
// presumably it is linux_set_freq(p).
1255 void palacios_pstate_ctrl_set_freq(uint64_t p)
1257 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1258 put_cpu_var(core_state);
1261 put_cpu_var(core_state);
// Host -> external: requires cpufreq on this CPU. Installs the Palacios stub
// governor and marks the core as externally controlled.
// NOTE(review): the result of linux_setup_palacios_governor() is ignored, so
// the mode changes even if the governor switch failed.
1266 static int switch_to_external(void)
1268 DEBUG("switch from host control to external\n");
1270 if (!(get_cpu_var(core_state).have_cpufreq)) {
1271 put_cpu_var(core_state);
1272 ERROR("No cpufreq - cannot switch to external...\n");
1275 put_cpu_var(core_state);
1277 linux_setup_palacios_governor();
1279 get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1280 put_cpu_var(core_state);
// Host -> direct: when cpufreq is present the Palacios stub governor is
// installed first so Linux stops changing the frequency. If a vendor backend
// with an arch_init hook exists, the core is marked as directly controlled
// and the hook is run.
// NOTE(review): the result of linux_setup_palacios_governor() is ignored.
1286 static int switch_to_direct(void)
1288 DEBUG("switch from host control to direct\n");
1290 if (get_cpu_var(core_state).have_cpufreq) {
1291 put_cpu_var(core_state);
1292 DEBUG("switch to direct from cpufreq\n");
1294 // The implementation would set the policy and governor to peg cpu
1295 // regardless of load
1296 linux_setup_palacios_governor();
1298 put_cpu_var(core_state);
1301 if (machine_state.funcs && machine_state.funcs->arch_init) {
1302 get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1304 machine_state.funcs->arch_init();
1306 put_cpu_var(core_state);
// Host -> internal: when cpufreq is present the Palacios stub governor is
// installed; the core is then marked as internally controlled.
// NOTE(review): the result of linux_setup_palacios_governor() is ignored.
1313 static int switch_to_internal(void)
1315 DEBUG("switch from host control to internal\n");
1317 if (get_cpu_var(core_state).have_cpufreq) {
1318 put_cpu_var(core_state);
1319 DEBUG("switch to internal on machine with cpu freq\n");
1320 linux_setup_palacios_governor();
1322 put_cpu_var(core_state);
1325 get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1327 put_cpu_var(core_state);
// External -> host: restores the saved Linux governor and marks the core as
// host controlled. Missing cpufreq is reported as an error.
// NOTE(review): have_cpufreq is tested twice; if the first test bails out
// (its return is not visible here) the second test is always true.
1333 static int switch_from_external(void)
1335 if (!(get_cpu_var(core_state).have_cpufreq)) {
1336 put_cpu_var(core_state);
1337 ERROR("No cpufreq - how did we get here... external...\n");
1340 put_cpu_var(core_state);
1342 DEBUG("Switching back to host control from external\n");
1344 if (get_cpu_var(core_state).have_cpufreq) {
1345 put_cpu_var(core_state);
1346 linux_restore_defaults();
1348 put_cpu_var(core_state);
1351 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1352 put_cpu_var(core_state);
// Direct -> host: sets the pstate stored as min_pstate (maximum performance
// per the comment below), runs the vendor arch_deinit hook, restores the
// Linux governor when cpufreq is present, and marks the core as host
// controlled.
// NOTE(review): the get_cpu_var() used as the set_pstate argument has no
// matching put_cpu_var() in the visible lines.
// NOTE(review): machine_state.funcs is dereferenced without a NULL check;
// direct mode is only entered when funcs is set, so this presumably holds.
1358 static int switch_from_direct(void)
1361 DEBUG("Switching back to host control from direct\n");
1363 // Set maximum performance, just in case there is no host control
1364 machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1365 machine_state.funcs->arch_deinit();
1367 if (get_cpu_var(core_state).have_cpufreq) {
1368 put_cpu_var(core_state);
1369 linux_restore_defaults();
1371 put_cpu_var(core_state);
1374 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1376 put_cpu_var(core_state);
// Internal -> host: restores the Linux governor when cpufreq is present and
// marks the core as host controlled.
1382 static int switch_from_internal(void)
1384 DEBUG("Switching back to host control from internal\n");
1386 if (get_cpu_var(core_state).have_cpufreq) {
1387 put_cpu_var(core_state);
1388 // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1389 // The implementation would switch back to default policy and governor
1390 linux_restore_defaults();
1392 put_cpu_var(core_state);
1395 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1397 put_cpu_var(core_state);
// Takes control of the calling CPU's p-state in the requested mode. If the
// core is not under host control it is released first, then the matching
// switch_to_*() is dispatched on `type`. Unknown types are logged.
// NOTE(review): the switch statement and the call in the DIRECT case are not
// visible in this listing; the results of the switch_to_*() calls that are
// visible are ignored.
1404 void palacios_pstate_ctrl_acquire(uint32_t type)
1406 if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) {
1407 put_cpu_var(core_state);
1408 palacios_pstate_ctrl_release();
1410 put_cpu_var(core_state);
1414 case V3_PSTATE_EXTERNAL_CONTROL:
1415 switch_to_external();
1417 case V3_PSTATE_DIRECT_CONTROL:
1420 case V3_PSTATE_INTERNAL_CONTROL:
1421 switch_to_internal();
1424 ERROR("Unknown pstate control type %u\n",type);
1430 // Wrappers for xcalls
// Argument-free form of acquire(EXTERNAL); dvfs_ctrl() passes it to
// palacios_xcall() so it runs on the target core.
1431 static void palacios_pstate_ctrl_acquire_external(void)
1433 palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
// Argument-free form of acquire(DIRECT), shaped like the external wrapper.
1436 static void palacios_pstate_ctrl_acquire_direct(void)
1438 palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
// Returns the calling CPU to host control by dispatching to the
// switch_from_*() routine that matches its current mode. The host-control
// case is tested first; its body is not visible here, presumably an early
// return.
// NOTE(review): the final ERROR reads core_state.mode directly rather than
// through a per-CPU accessor like the rest of the file - confirm this prints
// the intended CPU's value.
1442 void palacios_pstate_ctrl_release(void)
1444 if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) {
1445 put_cpu_var(core_state);
1448 put_cpu_var(core_state);
1450 switch (get_cpu_var(core_state).mode) {
1451 case V3_PSTATE_EXTERNAL_CONTROL:
1452 put_cpu_var(core_state);
1453 switch_from_external();
1455 case V3_PSTATE_DIRECT_CONTROL:
1456 put_cpu_var(core_state);
1457 switch_from_direct();
1459 case V3_PSTATE_INTERNAL_CONTROL:
1460 put_cpu_var(core_state);
1461 switch_from_internal();
1464 put_cpu_var(core_state);
1465 ERROR("Unknown pstate control type %u\n",core_state.mode);
// Cross-call target used by pstate_show(): stores the vendor backend's
// current pstate in the executing CPU's core_state.cur_hw_pstate, or 0 when
// there is no backend. arg is unused.
// NOTE(review): cur_hw_pstate is a uint8_t while get_pstate() returns
// uint64_t, so the stored value is truncated to 8 bits.
1471 static void update_hw_pstate(void *arg)
1473 if (machine_state.funcs && machine_state.funcs->get_pstate) {
1474 get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1475 put_cpu_var(core_state);
1477 get_cpu_var(core_state).cur_hw_pstate = 0;
1478 put_cpu_var(core_state);
1483 /***************************************************************************
1484 PROC Interface to expose state
1485 ***************************************************************************/
// seq_file show routine for the DVFS /proc entry. It refreshes every core's
// hardware pstate via a cross-call, prints the machine architecture and
// pstate support, then prints one line per core with its hardware pstate,
// current mode, available modes and the range for the active mode.
// NOTE(review): CPUs are walked as 0..num_online_cpus()-1, which assumes
// online CPU ids are contiguous from 0 - confirm for hotplug configurations.
// NOTE(review): one argument line of the per-core seq_printf and the return
// value are not visible in this listing.
1487 static int pstate_show(struct seq_file * file, void * v)
1490 unsigned int numcpus = num_online_cpus();
1492 seq_printf(file, "V3VEE DVFS Status\n\n");
1494 for (cpu=0;cpu<numcpus;cpu++) {
1495 palacios_xcall(cpu,update_hw_pstate,0);
1498 seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1499 machine_state.arch==INTEL ? "Intel" :
1500 machine_state.arch==AMD ? "AMD" : "Other",
1501 machine_state.supports_pstates ? "Yes" : "No");
1503 for (cpu=0;cpu<numcpus;cpu++) {
1504 struct pstate_core_info *s = &per_cpu(core_state,cpu);
1505 seq_printf(file,"pcore %u: hw pstate 0x%x mode %s of [ host ",cpu,
1507 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1508 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1509 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
1510 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1511 if (s->have_cpufreq) {
1512 seq_printf(file,"external ");
1514 if (machine_state.supports_pstates) {
1515 seq_printf(file,"direct ");
1517 seq_printf(file,"internal ] ");
1518 if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
1519 seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1521 if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
1522 seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1524 seq_printf(file,"\n");
1529 static int pstate_open(struct inode * inode, struct file * file)
1531 return single_open(file, pstate_show, NULL);
1535 static struct file_operations pstate_fops = {
1536 .owner = THIS_MODULE,
1537 .open = pstate_open,
1539 .llseek = seq_lseek,
1540 .release = seq_release
1543 int pstate_proc_setup(void)
1545 struct proc_dir_entry *proc;
1547 proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1550 ERROR("Failed to create proc entry for p-state control\n");
1554 proc->proc_fops = &pstate_fops;
/*
 * Remove the "v3-dvfs" entry from the Palacios proc directory.
 */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *parent = palacios_get_procdir();

    remove_proc_entry("v3-dvfs", parent);
}

1564 /********************************************************************
1565 User interface (ioctls)
1566 ********************************************************************/
1568 static int dvfs_ctrl(unsigned int cmd, unsigned long arg)
1570 struct v3_dvfs_ctrl_request r;
1572 if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1573 ERROR("Failed to copy DVFS request from user\n");
1577 if (r.pcore >= num_online_cpus()) {
1578 ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1583 case V3_DVFS_ACQUIRE: {
1584 switch (r.acq_type) {
1585 case V3_DVFS_EXTERNAL:
1586 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1589 case V3_DVFS_DIRECT:
1590 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1594 ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1599 case V3_DVFS_RELEASE: {
1600 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1604 case V3_DVFS_SETFREQ: {
1605 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1609 case V3_DVFS_SETPSTATE: {
1610 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1614 ERROR("Unknown DVFS command %u\n",r.cmd);
1622 void pstate_user_setup(void)
1624 add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1628 void pstate_user_teardown(void)
1630 remove_global_ctrl(V3_DVFS_CTRL);
1633 static struct v3_host_pstate_ctrl_iface hooks = {
1634 .get_chars = palacios_pstate_ctrl_get_chars,
1635 .acquire = palacios_pstate_ctrl_acquire,
1636 .release = palacios_pstate_ctrl_release,
1637 .set_pstate = palacios_pstate_ctrl_set_pstate,
1638 .get_pstate = palacios_pstate_ctrl_get_pstate,
1639 .set_freq = palacios_pstate_ctrl_set_freq,
1640 .get_freq = palacios_pstate_ctrl_get_freq,
1645 static int pstate_ctrl_init(void)
1648 unsigned int numcpus = num_online_cpus();
1650 pstate_arch_setup();
1652 for (cpu=0;cpu<numcpus;cpu++) {
1653 palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1656 V3_Init_Pstate_Ctrl(&hooks);
1658 if (pstate_proc_setup()) {
1659 ERROR("Unable to initialize P-State Control\n");
1663 pstate_user_setup();
1665 pstate_linux_init();
1667 INFO("P-State Control Initialized\n");
1672 static int pstate_ctrl_deinit(void)
1675 unsigned int numcpus=num_online_cpus();
1677 pstate_linux_deinit();
1679 pstate_user_teardown();
1681 pstate_proc_teardown();
1683 // release pstate control if we have it, and we need to do this on each processor
1684 for (cpu=0;cpu<numcpus;cpu++) {
1685 palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1689 // Free any mapping table we built for Intel
1690 if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) {
1691 palacios_free(intel_pstate_to_ctrl);
1699 static struct linux_ext pstate_ext = {
1700 .name = "PSTATE_CTRL",
1701 .init = pstate_ctrl_init,
1702 .deinit = pstate_ctrl_deinit,
1704 .guest_deinit = NULL,
1708 register_extension(&pstate_ext);