2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11 * all rights reserved.
13 * Author: Kyle C. Hale <kh@u.northwestern.edu>
14 * Shiva Rao <shiva.rao.717@gmail.com>
15 * Peter Dinda <pdinda@northwestern.edu>
17 * This is free software. you are permitted to use,
18 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/export.h>
25 #include <linux/cpufreq.h>
26 #include <linux/kernel.h>
27 #include <linux/kmod.h>
28 #include <linux/string.h>
29 #include <asm/processor.h>
31 #include <asm/msr-index.h>
33 #include <interfaces/vmm_pstate_ctrl.h>
36 #include "iface-pstate-ctrl.h"
38 #include "linux-exts.h"
41 This P-STATE control implementation includes:
43 - Direct control of Intel and AMD processor pstates
44 - External control of processor states via Linux (unimplemented)
45 - Internal control of processor states in Palacios (handoff from Linux)
47 Additionally, it provides a user-space interface for manipulating
48 p-state regardless of the host's functionality. This includes
49 an ioctl for commanding the implementation and a /proc file for
50 showing current status and capabilities.
/* Name under which the stub cpufreq governor (stub_governor) is registered. */
55 #define PALACIOS_GOVNAME "v3vee"
/* Size of the buffer that holds the shell command built by governor_switch(). */
56 #define MAX_PATH_LEN 128
/* Size of the buffer that holds a saved Linux governor name. */
57 #define MAX_GOV_NAME_LEN 16
/*
 * Per-core p-state bookkeeping. One instance exists per CPU (see the
 * DEFINE_PER_CPU of core_state) and is accessed through get_cpu_var()
 * on the owning core or per_cpu() from the /proc reader.
 */
60 struct pstate_core_info {
61 // Here we have the notion of host control
62 #define V3_PSTATE_HOST_CONTROL 0
63 // and all the modes from the Palacios interface:
64 // V3_PSTATE_EXTERNAL_CONTROL
65 // V3_PSTATE_DIRECT_CONTROL
66 // V3_PSTATE_INTERNAL_CONTROL
69 // Apply if we are under the DIRECT state
/* last p-state read from hardware; refreshed by update_hw_pstate() */
74 uint8_t cur_hw_pstate;
76 // Apply if we are under the EXTERNAL state
77 uint64_t cur_freq_khz;
78 uint64_t max_freq_khz;
79 uint64_t min_freq_khz;
/* Intel only: SpeedStep enable bit as found by init_arch_intel() */
82 uint8_t prior_speedstep;
83 uint8_t turbo_disabled;
88 // This is where we stash Linux's governor when we make a mode switch
/* heap buffer owned by this struct; released by free_linux_governor() */
89 char * linux_governor;
90 // We have this so we can restore the original frequency when we started
/* One pstate_core_info per physical CPU. */
96 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
100 // These are used to assert DIRECT control over the core pstates
/*
 * Architecture back-end operations. amd_funcs and intel_funcs each provide
 * every member; machine_state.funcs points at the one in use (or is NULL).
 */
101 struct pstate_core_funcs {
/* per-core hooks run when DIRECT control is taken / given back */
102 void (*arch_init)(void);
103 void (*arch_deinit)(void);
/* limits and current value, in the back-end's own p-state numbering */
104 uint8_t (*get_min_pstate)(void);
105 uint8_t (*get_max_pstate)(void);
106 uint8_t (*get_pstate)(void);
107 void (*set_pstate)(uint8_t pstate);
/*
 * Machine-wide description: detected vendor, capability flags recorded by
 * supports_pstates_amd()/supports_pstates_intel(), and the back-end table.
 */
110 struct pstate_machine_info {
111 enum {INTEL, AMD, OTHER } arch;
/* return value of the vendor's supports_pstates_*() probe */
112 int supports_pstates;
122 int have_opportunistic; // this means "Turbo Boost" or "IDA"
123 int have_policy_hint;
124 int have_hwp; // hardware-controlled performance states
125 int have_hdc; // hardware duty cycling
126 int have_mwait_ext; // mwait power extensions
127 int have_mwait_int; // mwait wakes on interrupt
130 int have_pstate_hw_coord; // mperf/aperf
132 // used for DIRECT control
133 struct pstate_core_funcs *funcs;
/* Single machine-wide instance, populated by pstate_arch_setup(). */
137 static struct pstate_machine_info machine_state;
140 /****************************************************
142 ***************************************************/
144 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
/* Read by get_min_pstate_amd()/get_max_pstate_amd(). */
145 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
/* Written by set_pstate_amd(). */
146 #define MSR_PSTATE_CTL_REG_AMD 0xc0010062
/* Read by get_pstate_amd(). */
147 #define MSR_PSTATE_STAT_REG_AMD 0xc0010063
/*
 * Overlay for MSR_PSTATE_LIMIT_REG_AMD. Users read the raw MSR into .val
 * and pick decoded fields out of .reg (see get_min/max_pstate_amd()).
 */
149 struct p_state_limit_reg_amd {
153 uint8_t pstate_limit : 4; /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
154 uint8_t pstate_max : 4; /* highest P-state value supported (lowest perf) */
157 } __attribute__((packed));
158 } __attribute__((packed));
/*
 * Overlay for MSR_PSTATE_STAT_REG_AMD. get_pstate_amd() reads the MSR into
 * .val and returns .reg.pstate.
 */
161 struct p_state_stat_reg_amd {
168 } __attribute__((packed));
169 } __attribute__((packed));
/*
 * Overlay for MSR_PSTATE_CTL_REG_AMD. set_pstate_amd() writes .val to the
 * MSR.
 */
172 struct p_state_ctl_reg_amd {
179 } __attribute__((packed));
180 } __attribute__((packed));
183 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
/*
 * Probe AMD power-management capabilities with CPUID, record them in
 * machine_state, and log them.
 *
 * Returns machine_state.have_pstate (non-zero when CPUID 0x80000007 EDX
 * bit 7 is set).
 */
184 static uint8_t supports_pstates_amd (void)
186 uint32_t eax, ebx, ecx, edx;
/* leaf 0x80000007: EDX bit 7 -> have_pstate, bit 9 -> have_coreboost, bit 11 -> have_feedback */
188 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
189 machine_state.have_pstate = !!(edx & (1 << 7));
190 machine_state.have_coreboost = !!(edx & (1<<9));
191 machine_state.have_feedback = !!(edx & (1<<11));
/* leaf 0x6: ECX bit 0 -> have_pstate_hw_coord */
193 cpuid(0x6, &eax, &ebx, &ecx, &edx);
194 machine_state.have_pstate_hw_coord = !!(ecx & 1);
196 INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
197 machine_state.have_pstate,
198 machine_state.have_coreboost,
199 machine_state.have_feedback,
200 machine_state.have_pstate_hw_coord);
202 return machine_state.have_pstate;
/* amd_funcs.arch_init hook; intentionally empty. */
208 static void init_arch_amd(void)
210 /* KCH: nothing to do here */
/* amd_funcs.arch_deinit hook; intentionally empty. */
214 static void deinit_arch_amd(void)
216 /* KCH: nothing to do here */
220 static uint8_t get_pstate_amd(void)
222 struct p_state_stat_reg_amd pstat;
224 rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
226 get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
227 put_cpu_var(core_state);
229 return pstat.reg.pstate;
/*
 * Request P-state @p by writing MSR_PSTATE_CTL_REG_AMD, then record @p as
 * this core's core_state.cur_pstate.
 * NOTE(review): the statements that fill pctl before the MSR write are not
 * visible in this view - confirm against the full file.
 */
233 static void set_pstate_amd(uint8_t p)
235 struct p_state_ctl_reg_amd pctl;
239 wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
241 get_cpu_var(core_state).cur_pstate=p;
242 put_cpu_var(core_state);
247 * NOTE: HW may change this value at runtime
249 static uint8_t get_max_pstate_amd(void)
251 struct p_state_limit_reg_amd plimits;
253 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
255 return plimits.reg.pstate_max;
259 static uint8_t get_min_pstate_amd(void)
261 struct p_state_limit_reg_amd plimits;
263 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
265 return plimits.reg.pstate_limit;
/* AMD back-end table; installed in machine_state.funcs by pstate_arch_setup(). */
269 static struct pstate_core_funcs amd_funcs =
271 .arch_init = init_arch_amd,
272 .arch_deinit = deinit_arch_amd,
273 .get_pstate = get_pstate_amd,
274 .set_pstate = set_pstate_amd,
275 .get_max_pstate = get_max_pstate_amd,
276 .get_min_pstate = get_min_pstate_amd,
281 /***********************************************************
283 **********************************************************/
287 This implementation uses SpeedStep, but does check
288 to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
292 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
/*
 * Intel MSR addresses. In the code visible here only MISC_ENABLE,
 * PLATFORM_INFO, PERF_CTL and PERF_STAT are actually accessed.
 * NOTE(review): "ENERY" in the last name looks like a misspelling of
 * "ENERGY"; it is left untouched because other code may use the name.
 */
293 #define MSR_MPERF_IA32 0x000000e7
294 #define MSR_APERF_IA32 0x000000e8
295 #define MSR_MISC_ENABLE_IA32 0x000001a0
296 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
297 #define MSR_PLATFORM_INFO_IA32 0x000000ce
298 #define MSR_PERF_CTL_IA32 0x00000199
299 #define MSR_PERF_STAT_IA32 0x00000198
300 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
303 /* Note that the actual meaning of the pstate
304 in the control and status registers is actually
305 implementation dependent, unlike AMD. The "official"
306 way to figure out the mapping from pstate to
307 these values is via ACPI. What is written in the register
308 is an "id" of an operation point
310 "Often", the 16 bit field consists of a high order byte
311 which is the frequency (the multiplier) and the low order
314 // MSR_PERF_CTL_IA32 r/w
/*
 * Field layout for MSR_PERF_CTL_IA32: a 16-bit target p-state id, 16
 * reserved bits, the dynamic-acceleration disable bit, and 31 reserved bits.
 */
315 struct perf_ctl_reg_intel {
319 // This is the target
320 // Note, not the ACPI pstate, but
321 // Intel's notion of pstate is that it's opaque
322 // for lots of implementations it seems to be
323 // frequency_id : voltage_id
324 // where frequency_id is typically the multiplier
325 uint16_t pstate : 16;
326 uint16_t reserved : 16;
327 // set to 1 to *disengage* dynamic acceleration
328 // Note that "IDA" and "Turbo" use the same interface
329 uint16_t dynamic_accel_disable : 1;
330 uint32_t reserved2 : 31;
332 } __attribute__((packed));
333 } __attribute__((packed));
335 // MSR_PERF_STAT_IA32 r
/* Field layout for MSR_PERF_STAT_IA32: 16-bit current p-state id, rest reserved. */
336 struct perf_stat_reg_intel {
340 // this is the current
341 uint16_t pstate : 16;
342 uint64_t reserved : 48;
344 } __attribute__((packed));
345 } __attribute__((packed));
347 // MSR_ENERGY_PERF_BIAS_IA32 r/w
/* Field layout for the energy/performance bias MSR: 4-bit policy hint, rest reserved. */
348 struct enery_perf_bias_reg_intel {
352 // this is the current
353 uint8_t policy_hint : 4;
354 uint64_t reserved : 60;
356 } __attribute__((packed));
357 } __attribute__((packed));
/*
 * Overlay used when reading MSR_PLATFORM_INFO_IA32: get_max_pstate_intel()
 * returns .reg.max_noturbo_ratio and get_min_pstate_intel() returns
 * .reg.min_ratio after loading the raw MSR into .val.
 */
360 struct turbo_mode_info_reg_intel {
365 uint8_t max_noturbo_ratio : 8;
367 uint8_t ppin_cap : 1;
369 uint8_t ratio_limit : 1;
370 uint8_t tdc_tdp_limit : 1;
372 uint8_t min_ratio : 8;
375 } __attribute__((packed));
376 } __attribute__((packed));
379 /* CPUID.01:ECX.EIST(7) - Enhanced Intel SpeedStep Technology */
/*
 * Probe Intel power-management capabilities with CPUID leaves 0x1, 0x6 and
 * 0x5, record them in machine_state, and log them.
 *
 * Returns machine_state.have_speedstep (CPUID leaf 0x1, ECX bit 7).
 */
380 static uint8_t supports_pstates_intel(void)
382 /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
384 uint32_t eax, ebx, ecx, edx;
386 cpuid(0x1, &eax, &ebx, &ecx, &edx);
387 machine_state.have_speedstep = !!(ecx & (1 << 7));
/* leaf 0x6: thermal / power management feature bits */
389 cpuid(0x6, &eax, &ebx, &ecx, &edx);
390 machine_state.have_pstate_hw_coord = !!(ecx & 1); // ?
391 machine_state.have_opportunistic = !!(eax & 1<<1);
392 machine_state.have_policy_hint = !!(ecx & 1<<3);
393 machine_state.have_hwp = !!(eax & 1<<7);
394 machine_state.have_hdc = !!(eax & 1<<13);
/* leaf 0x5: MONITOR/MWAIT feature bits */
396 cpuid(0x5, &eax, &ebx, &ecx, &edx);
397 machine_state.have_mwait_ext = !!(ecx & 1);
398 machine_state.have_mwait_int = !!(ecx & 1<<1);
401 INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
402 machine_state.have_speedstep,
403 machine_state.have_pstate_hw_coord,
404 machine_state.have_opportunistic,
405 machine_state.have_policy_hint,
406 machine_state.have_hwp,
407 machine_state.have_hdc,
408 machine_state.have_mwait_ext,
409 machine_state.have_mwait_int );
411 return machine_state.have_speedstep;
/*
 * intel_funcs.arch_init hook: save bit 16 of MSR_MISC_ENABLE_IA32 in this
 * core's core_state.prior_speedstep, then write the MSR back.
 * NOTE(review): the statement that sets the enable bit before the write is
 * not visible in this view - confirm against the full file.
 */
415 static void init_arch_intel(void)
419 rdmsrl(MSR_MISC_ENABLE_IA32, val);
421 // store prior speedstep setting
422 get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
423 put_cpu_var(core_state);
425 // enable speedstep (probably already on)
427 wrmsrl(MSR_MISC_ENABLE_IA32, val);
/*
 * intel_funcs.arch_deinit hook: put bit 16 of MSR_MISC_ENABLE_IA32 back to
 * the value init_arch_intel() saved in core_state.prior_speedstep.
 */
431 static void deinit_arch_intel(void)
435 rdmsrl(MSR_MISC_ENABLE_IA32, val);
/* clear bit 16, then OR the saved value back in */
437 val &= ~(1ULL << 16);
438 val |= get_cpu_var(core_state).prior_speedstep << 16;
439 put_cpu_var(core_state);
441 wrmsrl(MSR_MISC_ENABLE_IA32, val);
445 /* TODO: Intel P-states require sampling at intervals... */
/*
 * Read MSR_PERF_STAT_IA32, log the raw value, and return bits 15:8 of it
 * (treated here as the frequency id).
 */
446 static uint8_t get_pstate_intel(void)
451 rdmsrl(MSR_PERF_STAT_IA32,val);
453 pstate = val & 0xffff;
455 INFO("P-State: Get: 0x%llx\n", val);
457 // Assume top byte is the FID
458 //if (pstate & 0xff ) {
459 // ERROR("P-State: Intel returns confusing pstate %u\n",pstate);
462 // should check if turbo is active, in which case
463 // this value is not the whole story
465 return (uint8_t) (pstate>>8);
/*
 * Request p-state @p: read MSR_PERF_CTL_IA32, OR @p into bits 15:8, log and
 * write the value back, then record @p as this core's cur_pstate.
 * NOTE(review): whether the old bits 15:8 are cleared before the OR is not
 * visible in this view - confirm against the full file.
 */
468 static void set_pstate_intel(uint8_t p)
472 /* ...Intel IDA (dynamic acceleration)
473 if (c->no_turbo && !c->turbo_disabled) {
477 // leave all bits alone except for the likely
480 rdmsrl(MSR_PERF_CTL_IA32, val);
482 val |= ((uint64_t)p)<<8;
484 INFO("P-State: Set: 0x%llx\n", val);
486 wrmsrl(MSR_PERF_CTL_IA32, val);
488 get_cpu_var(core_state).cur_pstate = p;
489 put_cpu_var(core_state);
493 static uint8_t get_min_pstate_intel(void)
495 struct turbo_mode_info_reg_intel t;
497 rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
499 return t.reg.min_ratio;
504 static uint8_t get_max_pstate_intel (void)
506 struct turbo_mode_info_reg_intel t;
508 rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
510 return t.reg.max_noturbo_ratio;
/* Intel back-end table; installed in machine_state.funcs by pstate_arch_setup(). */
513 static struct pstate_core_funcs intel_funcs =
515 .arch_init = init_arch_intel,
516 .arch_deinit = deinit_arch_intel,
517 .get_pstate = get_pstate_intel,
518 .set_pstate = set_pstate_intel,
519 .get_max_pstate = get_max_pstate_intel,
520 .get_min_pstate = get_min_pstate_intel,
525 /***********************************************
526 Arch determination and setup
527 ***********************************************/
/*
 * Inline-asm helper whose outputs land in dest[0..3] from EAX, EBX, ECX and
 * EDX respectively (per the output constraints below).
 * NOTE(review): presumably executes CPUID with @id as the leaf; the asm
 * template line is not visible in this view.
 */
529 static inline void cpuid_string (uint32_t id, uint32_t dest[4])
532 :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
/*
 * Fill @name with the vendor id obtained from cpuid_string(0, ...): the
 * twelve bytes are taken from dest[1], dest[3], dest[2] in that order
 * (EBX, EDX, ECX).
 * NOTE(review): termination of @name and the return value are on lines not
 * visible in this view.
 */
537 static int get_cpu_vendor (char name[13])
542 cpuid_string(0,dest);
544 ((uint32_t*)name)[0]=dest[1];
545 ((uint32_t*)name)[1]=dest[3];
546 ((uint32_t*)name)[2]=dest[2];
/* Returns non-zero when the CPU vendor string is "GenuineIntel". */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "GenuineIntel") == 0;
}
/* Returns non-zero when the CPU vendor string is "AuthenticAMD". */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "AuthenticAMD") == 0;
}
/*
 * Detect the CPU vendor and fill machine_state accordingly: the arch tag,
 * the back-end function table (NULL for unknown vendors) and the result of
 * the vendor-specific capability probe.
 */
568 static int pstate_arch_setup(void)
572 machine_state.arch = AMD;
573 machine_state.funcs = &amd_funcs;
574 machine_state.supports_pstates = supports_pstates_amd();
575 INFO("PSTATE: P-State initialized for AMD\n");
576 } else if (is_intel()) {
577 machine_state.arch = INTEL;
578 machine_state.funcs = &intel_funcs;
579 machine_state.supports_pstates = supports_pstates_intel();
580 INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
/* neither vendor matched: no direct control available */
584 machine_state.arch = OTHER;
585 machine_state.funcs = NULL;
586 machine_state.supports_pstates = 0;
587 INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
596 /******************************************************************
598 *****************************************************************/
602 * This stub governor is simply a placeholder for preventing
603 * frequency changes from the Linux side. For now, we simply leave
604 * the frequency as is when we acquire control.
/*
 * Callback of the stub cpufreq governor (stub_governor.governor). The
 * START, STOP and LIMITS events share one case group; any other event is
 * reported with ERROR().
 */
606 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
610 /* we can't use cpufreq_driver_target here as it can result
611 * in a circular dependency, so we'll just do nothing.
613 case CPUFREQ_GOV_START:
614 case CPUFREQ_GOV_STOP:
615 case CPUFREQ_GOV_LIMITS:
619 ERROR("Undefined governor command\n");
/* The placeholder governor registered under PALACIOS_GOVNAME. */
627 static struct cpufreq_governor stub_governor =
629 .name = PALACIOS_GOVNAME,
630 .governor = governor_run,
631 .owner = THIS_MODULE,
/* Register stub_governor with cpufreq; the return value is not checked. */
635 static inline void pstate_register_linux_governor(void)
637 cpufreq_register_governor(&stub_governor);
/* Remove stub_governor from cpufreq. */
641 static inline void pstate_unregister_linux_governor(void)
643 cpufreq_unregister_governor(&stub_governor);
647 static int get_current_governor(char **buf, unsigned int cpu)
649 struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
650 char * govname = NULL;
653 ERROR("could not allocate cpufreq_policy\n");
657 if (cpufreq_get_policy(policy, cpu) != 0) {
658 ERROR("Could not get current cpufreq policy\n");
662 /* We're in interrupt context, should probably not wait here */
663 govname = palacios_alloc(MAX_GOV_NAME_LEN);
665 ERROR("Could not allocate space for governor name\n");
669 strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
671 get_cpu_var(core_state).linux_governor = govname;
672 put_cpu_var(core_state);
676 palacios_free(policy);
681 palacios_free(policy);
686 /* passed to the userspacehelper interface for cleanup */
/*
 * Frees the two heap objects governor_switch() allocated for the helper
 * invocation: the command string in argv[2] and the argv array itself.
 */
687 static void gov_switch_cleanup(struct subprocess_info * s)
689 palacios_free(s->argv[2]);
690 palacios_free(s->argv);
696 * @s - the governor to switch to
/*
 * Ask user space to make @s the cpufreq governor of @cpu by running a
 * shell command that echoes the name into that CPU's sysfs
 * scaling_governor file. The helper is started with UMH_NO_WAIT, so the
 * return value only reflects whether it could be launched, not whether the
 * switch took effect. gov_switch_cleanup() frees argv and the command.
 *
 * Returns the call_usermodehelper_fns() result.
 * NOTE(review): the argv[] assignments and the allocation-failure returns
 * are on lines not visible in this view.
 */
698 static int governor_switch(char * s, unsigned int cpu)
700 char * path_str = NULL;
703 static char * envp[] = {
706 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
709 argv = palacios_alloc(4*sizeof(char*));
711 ERROR("Couldn't allocate argv struct\n");
715 path_str = palacios_alloc(MAX_PATH_LEN);
717 ERROR("Couldn't allocate path string\n");
720 memset(path_str, 0, MAX_PATH_LEN);
722 snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
729 /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
730 return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
738 static inline void free_linux_governor(void)
740 palacios_free(get_cpu_var(core_state).linux_governor);
741 put_cpu_var(core_state);
/*
 * Take the frequency decision away from Linux on the calling core: save
 * the name of the current governor in core_state.linux_governor, then ask
 * for a switch to the stub governor PALACIOS_GOVNAME.
 * NOTE(review): cpu is obtained with get_cpu(); a matching put_cpu() is not
 * visible in this view - confirm that one exists on every return path.
 */
745 static int linux_setup_palacios_governor(void)
748 unsigned int cpu = get_cpu();
750 /* KCH: we assume the v3vee governor is already
751 * registered with kernel by this point
754 if (get_current_governor(&gov, cpu) < 0) {
755 ERROR("Could not get current governor\n");
759 DEBUG("saving current governor (%s)\n", gov);
/* get_current_governor() has already stored the same pointer; this repeats it */
761 get_cpu_var(core_state).linux_governor = gov;
762 put_cpu_var(core_state);
764 DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
766 /* set the new one to ours */
767 if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
768 ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
777 static int linux_deinit(void)
784 static int linux_get_pstate(void)
786 struct cpufreq_policy * policy = NULL;
787 struct cpufreq_frequency_table *table;
790 unsigned int count = 0;
792 policy = palacios_alloc(sizeof(struct cpufreq_policy));
794 ERROR("Could not allocate policy struct\n");
798 cpufreq_get_policy(policy, cpu);
799 table = cpufreq_frequency_get_table(cpu);
801 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
803 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
807 if (table[i].frequency == policy->cur) {
814 palacios_free(policy);
819 static int linux_get_freq(void)
821 struct cpufreq_policy * policy = NULL;
824 policy = palacios_alloc(sizeof(struct cpufreq_policy));
826 ERROR("Could not allocate policy struct\n");
830 if (cpufreq_get_policy(policy, cpu)) {
831 ERROR("Could not get current policy\n");
/*
 * Select a frequency for the calling core by p-state index @p: walk the
 * cpufreq frequency table, skipping invalid entries, and drive the policy
 * to a table entry with CPUFREQ_RELATION_H. A second
 * cpufreq_driver_target() call on table[last_valid] handles an index
 * beyond the last valid entry.
 * NOTE(review): the index-matching test and the bookkeeping of last_valid
 * are on lines not visible in this view. The table pointer returned by
 * cpufreq_frequency_get_table() is used without a visible NULL check -
 * confirm.
 */
839 static int linux_set_pstate(uint8_t p)
841 struct cpufreq_policy * policy = NULL;
842 struct cpufreq_frequency_table *table;
845 unsigned int count = 0;
849 policy = palacios_alloc(sizeof(struct cpufreq_policy));
851 ERROR("Could not allocate policy struct\n");
855 if (cpufreq_get_policy(policy, cpu)) {
856 ERROR("Could not get current policy\n");
859 table = cpufreq_frequency_get_table(cpu);
861 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
863 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
868 cpufreq_driver_target(policy, table[i].frequency, CPUFREQ_RELATION_H);
876 /* we need to deal with the case in which we get a number > max pstate */
878 cpufreq_driver_target(policy, table[last_valid].frequency, CPUFREQ_RELATION_H);
881 palacios_free(policy);
885 palacios_free(policy);
890 static int linux_set_freq(uint64_t f)
892 struct cpufreq_policy * policy = NULL;
896 policy = palacios_alloc(sizeof(struct cpufreq_policy));
898 ERROR("Could not allocate policy struct\n");
902 cpufreq_get_policy(policy, cpu);
904 if (f < policy->min) {
906 } else if (f > policy->max) {
912 cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_H);
914 palacios_free(policy);
/*
 * Hand frequency control back to Linux on the calling core: request a
 * switch to the governor name saved in core_state.linux_governor, then
 * release that saved name (on both the success and the failure path).
 * NOTE(review): cpu is obtained with get_cpu(); a matching put_cpu() is not
 * visible in this view. gov is used without a visible NULL check although
 * nothing here guarantees a governor was saved beforehand - confirm.
 */
919 static int linux_restore_defaults(void)
921 unsigned int cpu = get_cpu();
924 gov = get_cpu_var(core_state).linux_governor;
925 put_cpu_var(core_state);
927 DEBUG("restoring previous governor (%s)\n", gov);
929 if (governor_switch(gov, cpu) < 0) {
930 ERROR("Could not restore governor to (%s)\n", gov);
934 free_linux_governor();
938 free_linux_governor();
944 /******************************************************************
945 Generic Interface as provided to Palacios and to the rest of the
947 ******************************************************************/
949 static void init_core(void)
952 struct cpufreq_policy *p;
955 DEBUG("P-State Core Init\n");
957 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
958 get_cpu_var(core_state).cur_pstate = 0;
960 if (machine_state.funcs) {
961 get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
962 get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
964 get_cpu_var(core_state).min_pstate = 0;
965 get_cpu_var(core_state).max_pstate = 0;
969 cpu = get_cpu(); put_cpu();
971 p = cpufreq_cpu_get(cpu);
974 get_cpu_var(core_state).have_cpufreq = 0;
975 get_cpu_var(core_state).min_freq_khz=0;
976 get_cpu_var(core_state).max_freq_khz=0;
977 get_cpu_var(core_state).cur_freq_khz=0;
979 get_cpu_var(core_state).have_cpufreq = 1;
980 get_cpu_var(core_state).min_freq_khz=p->min;
981 get_cpu_var(core_state).max_freq_khz=p->max;
982 get_cpu_var(core_state).cur_freq_khz=p->cur;
986 put_cpu_var(core_state);
/* Forward declaration: deinit_core() calls it ahead of the definition. */
991 void palacios_pstate_ctrl_release(void);
/*
 * Per-core teardown, run on every core via palacios_xcall() from
 * pstate_ctrl_deinit(): releases whatever p-state control the core holds.
 */
994 static void deinit_core(void)
997 DEBUG("P-State Core Deinit\n");
999 palacios_pstate_ctrl_release();
1004 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c)
1006 memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1009 c->features = V3_PSTATE_INTERNAL_CONTROL;
1011 if (get_cpu_var(core_state).have_cpufreq) {
1012 c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1015 if (machine_state.arch==AMD || machine_state.arch==INTEL) {
1016 c->features |= V3_PSTATE_DIRECT_CONTROL;
1018 c->cur_mode = get_cpu_var(core_state).mode;
1019 c->min_pstate = get_cpu_var(core_state).min_pstate;
1020 c->max_pstate = get_cpu_var(core_state).max_pstate;
1021 c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1022 c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1023 c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1024 c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1026 put_cpu_var(core_state);
1033 uint8_t palacios_pstate_ctrl_get_pstate(void)
1035 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1036 put_cpu_var(core_state);
1037 return machine_state.funcs->get_pstate();
1038 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1039 put_cpu_var(core_state);
1040 return linux_get_pstate();
1042 put_cpu_var(core_state);
1048 void palacios_pstate_ctrl_set_pstate(uint8_t p)
1050 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1051 put_cpu_var(core_state);
1052 machine_state.funcs->set_pstate(p);
1053 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1054 put_cpu_var(core_state);
1055 linux_set_pstate(p);
/*
 * Cross-call adapter: the p-state travels in the pointer argument itself
 * (dvfs_ctrl() casts the value to void *), so it is cast back here.
 */
1060 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1062 palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
/*
 * Return the calling core's frequency via linux_get_freq() when the core
 * is under EXTERNAL control.
 * NOTE(review): the value returned in other modes is on a line not visible
 * in this view.
 */
1066 uint64_t palacios_pstate_ctrl_get_freq(void)
1068 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1069 put_cpu_var(core_state);
1070 return linux_get_freq();
1072 put_cpu_var(core_state);
/*
 * Frequency setter for the calling core; only the EXTERNAL-control case
 * does any work.
 * NOTE(review): the call made in the EXTERNAL branch is on a line not
 * visible in this view (presumably linux_set_freq(p)).
 */
1078 void palacios_pstate_ctrl_set_freq(uint64_t p)
1080 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1081 put_cpu_var(core_state);
1084 put_cpu_var(core_state);
/*
 * Enter EXTERNAL control on the calling core. Fails with an error message
 * when cpufreq does not manage the core; otherwise returns the result of
 * linux_restore_defaults().
 * NOTE(review): no visible statement sets core_state.mode to
 * V3_PSTATE_EXTERNAL_CONTROL here - confirm whether that is intended.
 */
1088 static int switch_to_external(void)
1090 if (!(get_cpu_var(core_state).have_cpufreq)) {
1091 put_cpu_var(core_state);
1092 ERROR("No cpufreq - cannot switch to external...\n");
1095 put_cpu_var(core_state);
1097 DEBUG("Switching to external control\n");
1098 return linux_restore_defaults();
1102 static int switch_to_direct(void)
1104 if (get_cpu_var(core_state).have_cpufreq) {
1105 put_cpu_var(core_state);
1106 DEBUG("switch to direct from cpufreq\n");
1108 // The implementation would set the policy and governor to peg cpu
1109 // regardless of load
1110 linux_setup_palacios_governor();
1113 if (machine_state.funcs && machine_state.funcs->arch_init) {
1114 get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1116 machine_state.funcs->arch_init();
1118 put_cpu_var(core_state);
1125 static int switch_to_internal(void)
1127 if (get_cpu_var(core_state).have_cpufreq) {
1128 put_cpu_var(core_state);
1129 DEBUG("switch to internal on machine with cpu freq\n");
1130 linux_setup_palacios_governor();
1133 get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1135 put_cpu_var(core_state);
1141 static int switch_from_external(void)
1143 if (!(get_cpu_var(core_state).have_cpufreq)) {
1144 put_cpu_var(core_state);
1145 ERROR("No cpufreq - how did we get here... external...\n");
1149 DEBUG("Switching from external...\n");
1150 linux_restore_defaults();
1152 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1154 put_cpu_var(core_state);
1160 static int switch_from_direct(void)
1162 if (get_cpu_var(core_state).have_cpufreq) {
1163 put_cpu_var(core_state);
1164 DEBUG("Switching back to cpufreq control from direct\n");
1165 linux_restore_defaults();
1168 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1170 machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1172 machine_state.funcs->arch_deinit();
1174 put_cpu_var(core_state);
1180 static int switch_from_internal(void)
1182 if (get_cpu_var(core_state).have_cpufreq) {
1183 put_cpu_var(core_state);
1184 ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1185 // The implementation would switch back to default policy and governor
1186 linux_restore_defaults();
1189 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1191 put_cpu_var(core_state);
/*
 * Take p-state control of kind @type on the calling core. Any control
 * currently held (mode other than HOST) is released first, then the
 * matching switch_to_*() routine is run; unknown types are reported.
 */
1198 void palacios_pstate_ctrl_acquire(uint32_t type)
1200 if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) {
1201 palacios_pstate_ctrl_release();
1204 put_cpu_var(core_state);
1207 case V3_PSTATE_EXTERNAL_CONTROL:
1208 switch_to_external();
1210 case V3_PSTATE_DIRECT_CONTROL:
1213 case V3_PSTATE_INTERNAL_CONTROL:
1214 switch_to_internal();
1217 ERROR("Unknown pstate control type %u\n",type);
1223 // Wrappers for xcalls
/* Argument-free form of acquire(EXTERNAL), used as a cross-call target by dvfs_ctrl(). */
1224 static void palacios_pstate_ctrl_acquire_external(void)
1226 palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
/* Argument-free form of acquire(DIRECT), used as a cross-call target by dvfs_ctrl(). */
1229 static void palacios_pstate_ctrl_acquire_direct(void)
1231 palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1235 void palacios_pstate_ctrl_release(void)
1237 if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) {
1238 put_cpu_var(core_state);
1242 switch (get_cpu_var(core_state).mode) {
1243 case V3_PSTATE_EXTERNAL_CONTROL:
1244 switch_from_external();
1246 case V3_PSTATE_DIRECT_CONTROL:
1247 switch_from_direct();
1249 case V3_PSTATE_INTERNAL_CONTROL:
1250 switch_from_internal();
1253 ERROR("Unknown pstate control type %u\n",core_state.mode);
1257 put_cpu_var(core_state);
1262 static void update_hw_pstate(void *arg)
1264 if (machine_state.funcs && machine_state.funcs->get_pstate) {
1265 get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1266 put_cpu_var(core_state);
1268 get_cpu_var(core_state).cur_hw_pstate = 0;
1269 put_cpu_var(core_state);
1274 /***************************************************************************
1275 PROC Interface to expose state
1276 ***************************************************************************/
/*
 * seq_file show callback behind the "v3-dvfs" proc entry. Refreshes every
 * core's cached hardware p-state with a cross-call, prints the detected
 * architecture and p-state support, then prints one line per core with its
 * mode, the control kinds available to it, and the frequency or p-state
 * range for the EXTERNAL or DIRECT mode respectively.
 * NOTE(review): both loops run cpu = 0..num_online_cpus()-1, which assumes
 * online CPU ids are contiguous from 0 - confirm.
 */
1278 static int pstate_show(struct seq_file * file, void * v)
1281 unsigned int numcpus = num_online_cpus();
1283 seq_printf(file, "V3VEE DVFS Status\n\n");
1285 for (cpu=0;cpu<numcpus;cpu++) {
1286 palacios_xcall(cpu,update_hw_pstate,0);
1289 seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1290 machine_state.arch==INTEL ? "Intel" :
1291 machine_state.arch==AMD ? "AMD" : "Other",
1292 machine_state.supports_pstates ? "Yes" : "No");
1294 for (cpu=0;cpu<numcpus;cpu++) {
/* read another core's state directly; no cross-call for the printout */
1295 struct pstate_core_info *s = &per_cpu(core_state,cpu);
1296 seq_printf(file,"pcore %u: hw pstate %u mode %s of [ host ",cpu,
1298 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1299 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1300 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
1301 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1302 if (s->have_cpufreq) {
1303 seq_printf(file,"external ");
1305 if (machine_state.supports_pstates) {
1306 seq_printf(file,"direct ");
1308 seq_printf(file,"internal ] ");
1309 if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
1310 seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1312 if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
1313 seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1315 seq_printf(file,"\n");
/* open() handler of the proc entry: binds pstate_show() via single_open(). */
1320 static int pstate_open(struct inode * inode, struct file * file)
1322 return single_open(file, pstate_show, NULL);
/*
 * File operations of the "v3-dvfs" proc entry.
 * NOTE(review): the file is opened with single_open() but released with
 * seq_release; single_release is the usual counterpart - confirm.
 */
1326 static struct file_operations pstate_fops = {
1327 .owner = THIS_MODULE,
1328 .open = pstate_open,
1330 .llseek = seq_lseek,
1331 .release = seq_release
/*
 * Create the read-only (0444) "v3-dvfs" entry under the Palacios proc
 * directory and attach pstate_fops to it. Reports an error when the entry
 * cannot be created.
 */
1334 int pstate_proc_setup(void)
1336 struct proc_dir_entry *proc;
1338 proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1341 ERROR("Failed to create proc entry for p-state control\n");
1345 proc->proc_fops = &pstate_fops;
/* Remove the "v3-dvfs" entry from the Palacios proc directory. */
1350 void pstate_proc_teardown(void)
1352 remove_proc_entry("v3-dvfs",palacios_get_procdir());
1355 /********************************************************************
1356 User interface (ioctls)
1357 ********************************************************************/
/*
 * Handler for the V3_DVFS_CTRL global control. Copies a
 * v3_dvfs_ctrl_request from user space, rejects requests whose pcore is
 * not below num_online_cpus(), and then runs the requested operation on
 * that core through palacios_xcall(): acquire (external or direct),
 * release, set frequency, or set p-state.
 * NOTE(review): several targets are cast to void (*)(void *) although their
 * real signatures differ (e.g. palacios_pstate_ctrl_set_freq takes a
 * uint64_t); this relies on the calling convention - confirm it is intended.
 */
1359 static int dvfs_ctrl(unsigned int cmd, unsigned long arg)
1361 struct v3_dvfs_ctrl_request r;
1363 if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1364 ERROR("Failed to copy DVFS request from user\n");
1368 if (r.pcore >= num_online_cpus()) {
1369 ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1374 case V3_DVFS_ACQUIRE: {
1375 switch (r.acq_type) {
1376 case V3_DVFS_EXTERNAL:
1377 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1380 case V3_DVFS_DIRECT:
1381 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1385 ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1390 case V3_DVFS_RELEASE: {
1391 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1395 case V3_DVFS_SETFREQ: {
/* the frequency value itself is passed in the pointer argument */
1396 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1400 case V3_DVFS_SETPSTATE: {
/* the p-state value itself is passed in the pointer argument */
1401 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1405 ERROR("Unknown DVFS command %u\n",r.cmd);
/* Install dvfs_ctrl() as the handler for the V3_DVFS_CTRL global control. */
1413 void pstate_user_setup(void)
1415 add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
/* Remove the V3_DVFS_CTRL global control handler. */
1419 void pstate_user_teardown(void)
1421 remove_global_ctrl(V3_DVFS_CTRL);
/* Host p-state interface handed to Palacios by V3_Init_Pstate_Ctrl() in pstate_ctrl_init(). */
1424 static struct v3_host_pstate_ctrl_iface hooks = {
1425 .get_chars = palacios_pstate_ctrl_get_chars,
1426 .acquire = palacios_pstate_ctrl_acquire,
1427 .release = palacios_pstate_ctrl_release,
1428 .set_pstate = palacios_pstate_ctrl_set_pstate,
1429 .get_pstate = palacios_pstate_ctrl_get_pstate,
1430 .set_freq = palacios_pstate_ctrl_set_freq,
1431 .get_freq = palacios_pstate_ctrl_get_freq,
/*
 * Extension init hook. In order: detect the architecture, initialise each
 * core with a cross-call to init_core(), register the host interface with
 * Palacios, create the proc entry, install the user control, and register
 * the stub cpufreq governor.
 * NOTE(review): the per-core loop runs cpu = 0..num_online_cpus()-1, which
 * assumes online CPU ids are contiguous from 0 - confirm.
 */
1436 static int pstate_ctrl_init(void)
1439 unsigned int numcpus = num_online_cpus();
1441 pstate_arch_setup();
1443 for (cpu=0;cpu<numcpus;cpu++) {
1444 palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1447 V3_Init_Pstate_Ctrl(&hooks);
1449 if (pstate_proc_setup()) {
1450 ERROR("Unable to initialize P-State Control\n");
1454 pstate_user_setup();
1456 pstate_register_linux_governor();
1458 INFO("P-State Control Initialized\n");
/*
 * Extension deinit hook. Unregisters the stub governor, removes the user
 * control and the proc entry, then runs deinit_core() on every core.
 * NOTE(review): the governor is unregistered before the cores release
 * control, yet releasing may ask Linux to switch governors - confirm the
 * ordering is intended.
 */
1463 static int pstate_ctrl_deinit(void)
1466 unsigned int numcpus=num_online_cpus();
1468 pstate_unregister_linux_governor();
1470 pstate_user_teardown();
1472 pstate_proc_teardown();
1474 // release pstate control if we have it, and we need to do this on each processor
1475 for (cpu=0;cpu<numcpus;cpu++) {
1476 palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
/* Extension descriptor tying the init/deinit hooks to the name "PSTATE_CTRL". */
1483 static struct linux_ext pstate_ext = {
1484 .name = "PSTATE_CTRL",
1485 .init = pstate_ctrl_init,
1486 .deinit = pstate_ctrl_deinit,
1488 .guest_deinit = NULL,
/* Make the extension known to the Palacios Linux module. */
1492 register_extension(&pstate_ext);