2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11 * all rights reserved.
13 * Author: Kyle C. Hale <kh@u.northwestern.edu>
14 * Shiva Rao <shiva.rao.717@gmail.com>
15 * Peter Dinda <pdinda@northwestern.edu>
17 * This is free software. you are permitted to use,
18 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/string.h>
28 #include <asm/processor.h>
30 #include <asm/msr-index.h>
32 // Used to determine the appropriate pstates values on Intel
33 #include <linux/acpi.h>
34 #include <acpi/processor.h>
36 #include <interfaces/vmm_pstate_ctrl.h>
39 #include "iface-pstate-ctrl.h"
41 #include "linux-exts.h"
44 This P-STATE control implementation includes:
46 - Direct control of Intel and AMD processor pstates
47 - External control of processor states via Linux (unimplemented)
48 - Internal control of processor states in Palacios (handoff from Linux)
50 Additionally, it provides a user-space interface for manipulating
51 p-state regardless of the host's functionality. This includes
52 an ioctl for commanding the implementation and a /proc file for
53 showing current status and capabilities.
55 What we mean by "pstate" here is the processor's internal
56 configuration. For AMD, this is defined as being the same as
57 the ACPI-defined p-state. For Intel, it is not. There, it is the
58 contents of the perf ctl MSR, which, often, is the frequency id
59 and voltage id (the multipliers).
/* Name under which the stub cpufreq governor (stub_governor) is registered. */
64 #define PALACIOS_GOVNAME "v3vee"
/* Size of the buffer in which governor_switch() builds its shell command. */
65 #define MAX_PATH_LEN 128
/* Size of the buffer in which get_current_governor() saves a governor name. */
66 #define MAX_GOV_NAME_LEN 16
/*
 * Per-core P-state bookkeeping; one instance exists per CPU (see core_state).
 * NOTE(review): members referenced elsewhere in this file (mode, cur_pstate,
 * min_pstate, max_pstate, have_cpufreq) and the closing brace are not visible
 * in this span -- confirm against the complete definition.
 */
69 struct pstate_core_info {
70 // Here we have the notion of host control
71 #define V3_PSTATE_HOST_CONTROL 0
72 // and all the modes from the Palacios interface:
73 // V3_PSTATE_EXTERNAL_CONTROL
74 // V3_PSTATE_DIRECT_CONTROL
75 // V3_PSTATE_INTERNAL_CONTROL
78 // Apply if we are under the DIRECT state
// Value last returned by the architecture get_pstate hook (written by update_hw_pstate)
83 uint8_t cur_hw_pstate;
85 // Apply if we are under the EXTERNAL state
// Filled in by init_core from the cpufreq policy (cur/max/min), or zeroed
86 uint64_t cur_freq_khz;
87 uint64_t max_freq_khz;
88 uint64_t min_freq_khz;
// Bit 16 of MSR_MISC_ENABLE_IA32 as saved by init_arch_intel and restored by deinit_arch_intel
91 uint8_t prior_speedstep;
// NOTE(review): no live code in this file reads or writes turbo_disabled
92 uint8_t turbo_disabled;
97 // This is where we stash Linux's governor when we make a mode switch
98 char * linux_governor;
99 // We have this so we can restore the original frequency when we started
100 uint64_t original_hz;
/* One pstate_core_info per CPU; accessed through get_cpu_var()/put_cpu_var() and per_cpu(). */
105 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
/*
 * Architecture-specific operations used to assert DIRECT control over a
 * core's P-state.  One table exists per supported vendor (amd_funcs,
 * intel_funcs); machine_state.funcs points at the active one.
 */
struct pstate_core_funcs {
    void     (*arch_init)(void);            /* run when a core enters direct control */
    void     (*arch_deinit)(void);          /* run when a core leaves direct control */
    uint64_t (*get_min_pstate)(void);
    uint64_t (*get_max_pstate)(void);
    uint64_t (*get_pstate)(void);
    void     (*set_pstate)(uint64_t pstate);
};
/*
 * Machine-wide capability record, filled in by pstate_arch_setup() and the
 * supports_pstates_*() probes.
 * NOTE(review): members written elsewhere in this file (have_pstate,
 * have_coreboost, have_feedback, have_speedstep) and the closing brace are
 * not visible in this span.
 */
119 struct pstate_machine_info {
120 enum {INTEL, AMD, OTHER } arch;
// Result of supports_pstates_amd()/supports_pstates_intel(); 0 for other vendors
121 int supports_pstates;
131 int have_opportunistic; // this means "Turbo Boost" or "IDA"
132 int have_policy_hint;
133 int have_hwp; // hardware-controlled performance states
134 int have_hdc; // hardware duty cycling
135 int have_mwait_ext; // mwait power extensions
136 int have_mwait_int; // mwait wakes on interrupt
139 int have_pstate_hw_coord; // mperf/aperf
141 // used for DIRECT control
// NULL when the vendor is neither AMD nor Intel (see pstate_arch_setup)
142 struct pstate_core_funcs *funcs;
/* Single machine-wide capability record shared by every core. */
146 static struct pstate_machine_info machine_state;
149 /****************************************************
151 ***************************************************/
153 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
/* Read by get_max_pstate_amd() and get_min_pstate_amd() */
154 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
/* Written by set_pstate_amd() */
155 #define MSR_PSTATE_CTL_REG_AMD 0xc0010062
/* Read by get_pstate_amd() */
156 #define MSR_PSTATE_STAT_REG_AMD 0xc0010063
/*
 * Layout of MSR_PSTATE_LIMIT_REG_AMD.  Code in this file reads it as a raw
 * 64-bit value (.val) and as bit-fields (.reg.pstate_limit, .reg.pstate_max).
 * NOTE(review): the declarations providing .val/.reg and any reserved
 * padding are not visible in this span.
 */
158 struct p_state_limit_reg_amd {
162 uint8_t pstate_limit : 4; /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
163 uint8_t pstate_max : 4; /* highest P-state value supported (lowest perf) */
166 } __attribute__((packed));
167 } __attribute__((packed));
/*
 * Layout of MSR_PSTATE_STAT_REG_AMD.  get_pstate_amd() accesses it as
 * .val (raw value) and .reg.pstate.
 * NOTE(review): the member declarations are not visible in this span.
 */
170 struct p_state_stat_reg_amd {
177 } __attribute__((packed));
178 } __attribute__((packed));
/*
 * Layout of MSR_PSTATE_CTL_REG_AMD.  set_pstate_amd() writes its raw
 * value (.val) to the MSR.
 * NOTE(review): the member declarations are not visible in this span.
 */
181 struct p_state_ctl_reg_amd {
188 } __attribute__((packed));
189 } __attribute__((packed));
192 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
193 static uint8_t supports_pstates_amd (void)
195 uint32_t eax, ebx, ecx, edx;
197 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
198 machine_state.have_pstate = !!(edx & (1 << 7));
199 machine_state.have_coreboost = !!(edx & (1<<9));
200 machine_state.have_feedback = !!(edx & (1<<11));
202 cpuid(0x6, &eax, &ebx, &ecx, &edx);
203 machine_state.have_pstate_hw_coord = !!(ecx & 1);
205 INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
206 machine_state.have_pstate,
207 machine_state.have_coreboost,
208 machine_state.have_feedback,
209 machine_state.have_pstate_hw_coord);
211 return machine_state.have_pstate;
/* arch_init hook for AMD: no per-core preparation is needed. */
static void init_arch_amd(void)
{
    /* KCH: nothing to do here */
    return;
}
/* arch_deinit hook for AMD: nothing was set up, so nothing is undone. */
static void deinit_arch_amd(void)
{
    /* KCH: nothing to do here */
    return;
}
229 static uint64_t get_pstate_amd(void)
231 struct p_state_stat_reg_amd pstat;
233 rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
235 get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
236 put_cpu_var(core_state);
238 return pstat.reg.pstate;
242 static void set_pstate_amd(uint64_t p)
244 struct p_state_ctl_reg_amd pctl;
248 wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
250 get_cpu_var(core_state).cur_pstate=p;
251 put_cpu_var(core_state);
256 * NOTE: HW may change this value at runtime
258 static uint64_t get_max_pstate_amd(void)
260 struct p_state_limit_reg_amd plimits;
262 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
264 return plimits.reg.pstate_max;
268 static uint64_t get_min_pstate_amd(void)
270 struct p_state_limit_reg_amd plimits;
272 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
274 return plimits.reg.pstate_limit;
278 static struct pstate_core_funcs amd_funcs =
280 .arch_init = init_arch_amd,
281 .arch_deinit = deinit_arch_amd,
282 .get_pstate = get_pstate_amd,
283 .set_pstate = set_pstate_amd,
284 .get_max_pstate = get_max_pstate_amd,
285 .get_min_pstate = get_min_pstate_amd,
290 /***********************************************************
292 **********************************************************/
296 This implementation uses SpeedStep, but does check
297 to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
301 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
302 #define MSR_MPERF_IA32 0x000000e7
303 #define MSR_APERF_IA32 0x000000e8
/* Bit 16 is saved, set and restored by init_arch_intel()/deinit_arch_intel() */
304 #define MSR_MISC_ENABLE_IA32 0x000001a0
305 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
306 #define MSR_PLATFORM_INFO_IA32 0x000000ce
/* Read-modify-written by set_pstate_intel() */
307 #define MSR_PERF_CTL_IA32 0x00000199
/* Read by get_pstate_intel() */
308 #define MSR_PERF_STAT_IA32 0x00000198
/* NOTE(review): "ENERY" looks like a misspelling of "ENERGY"; renaming needs a check of all users */
309 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
312 /* Note that the actual meaning of the pstate
313 in the control and status registers is actually
314 implementation dependent, unlike AMD. The "official"
315 way to figure out the mapping from pstate to
316 these values is via ACPI. What is written in the register
317 is an "id" of an operation point
319 "Often", the 16 bit field consists of a high order byte
320 which is the frequency (the multiplier) and the low order
// Bit layout of the performance-control MSR.
// NOTE(review): the declarations that wrap these bit-fields (the two
// packed closers below imply an outer aggregate) are not visible here.
323 // MSR_PERF_CTL_IA32 r/w
324 struct perf_ctl_reg_intel {
328 // This is the target
329 // Note, not the ACPI pstate, but
330 // Intel's notion of pstate is that it's opaque
331 // for lots of implementations it seems to be
332 // frequency_id : voltage_id
333 // where frequency_id is typically the multiplier
334 uint16_t pstate : 16;
335 uint16_t reserved : 16;
336 // set to 1 to *disengage* dynamic acceleration
337 // Note that "IDA" and "Turbo" use the same interface
338 uint16_t dynamic_accel_disable : 1;
339 uint32_t reserved2 : 31;
341 } __attribute__((packed));
342 } __attribute__((packed));
// Bit layout of the performance-status MSR: low 16 bits hold the pstate.
// NOTE(review): the declarations wrapping these bit-fields are not visible here.
344 // MSR_PERF_STAT_IA32 r
345 struct perf_stat_reg_intel {
349 // this is the current
350 uint16_t pstate : 16;
351 uint64_t reserved : 48;
353 } __attribute__((packed));
354 } __attribute__((packed));
// Bit layout of the energy/performance bias MSR: a 4-bit policy hint.
// NOTE(review): "enery" in the tag looks like a misspelling of "energy";
// the declarations wrapping these bit-fields are not visible here.
356 // MSR_ENERGY_PERF_BIAS_IA32 r/w
357 struct enery_perf_bias_reg_intel {
361 // this is the current
362 uint8_t policy_hint : 4;
363 uint64_t reserved : 60;
365 } __attribute__((packed));
366 } __attribute__((packed));
// Bit-field view of a turbo/platform information register.
// NOTE(review): which MSR this describes, and the reserved fields between
// the members shown, are not visible in this span -- confirm before use.
369 struct turbo_mode_info_reg_intel {
374 uint8_t max_noturbo_ratio : 8;
376 uint8_t ppin_cap : 1;
378 uint8_t ratio_limit : 1;
379 uint8_t tdc_tdp_limit : 1;
381 uint8_t min_ratio : 8;
384 } __attribute__((packed));
385 } __attribute__((packed));
/*
 * One row of the Intel pstate mapping table.  It carries the critical
 * information from Linux's struct acpi_processor_px in an OS-neutral form,
 * which makes the code easier to port to other OSes.
 */
struct intel_pstate_info {
    uint64_t freq;   /* operating frequency, KHz */
    uint64_t ctrl;   /* value to write into the _CTL MSR to select this state */
};
/* Fallback table, used when the mapping cannot be built locally from ACPI. */
static struct intel_pstate_info *intel_pstate_to_ctrl_internal = NULL;
static int intel_num_pstates_internal = 0;

/* Active table: either the fallback above or an array built at probe time. */
static struct intel_pstate_info *intel_pstate_to_ctrl = NULL;
static int intel_num_pstates = 0;
403 /* CPUID.01:ECX.AES(7) */
404 static uint8_t supports_pstates_intel(void)
406 /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
408 uint32_t eax, ebx, ecx, edx;
410 cpuid(0x1, &eax, &ebx, &ecx, &edx);
411 machine_state.have_speedstep = !!(ecx & (1 << 7));
413 cpuid(0x6, &eax, &ebx, &ecx, &edx);
414 machine_state.have_pstate_hw_coord = !!(ecx & 1); // ?
415 machine_state.have_opportunistic = !!(eax & 1<<1);
416 machine_state.have_policy_hint = !!(ecx & 1<<3);
417 machine_state.have_hwp = !!(eax & 1<<7);
418 machine_state.have_hdc = !!(eax & 1<<13);
420 cpuid(0x5, &eax, &ebx, &ecx, &edx);
421 machine_state.have_mwait_ext = !!(ecx & 1);
422 machine_state.have_mwait_int = !!(ecx & 1<<1);
425 INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
426 machine_state.have_speedstep,
427 machine_state.have_pstate_hw_coord,
428 machine_state.have_opportunistic,
429 machine_state.have_policy_hint,
430 machine_state.have_hwp,
431 machine_state.have_hdc,
432 machine_state.have_mwait_ext,
433 machine_state.have_mwait_int );
436 if (machine_state.have_speedstep) {
438 // Build mapping table (from "pstate" (0..) to ctrl value for MSR
439 if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) {
440 put_cpu_var(processors);
441 // no acpi... revert to internal table
442 intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
443 intel_num_pstates=intel_num_pstates_internal;
445 intel_num_pstates = get_cpu_var(processors)->performance->state_count;
446 if (intel_num_pstates) {
447 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
448 if (!intel_pstate_to_ctrl) {
449 ERROR("P-State: Cannot allocate space for mapping...\n");
452 for (i=0;i<intel_num_pstates;i++) {
453 intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
454 intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
458 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
461 put_cpu_var(processors);
462 INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
463 for (i=0;i<intel_num_pstates;i++) {
464 INFO("P-State: Intel Mapping %u: freq=%llu ctrl=%llx\n",
465 i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
468 INFO("P-State: Intel: No speedstep here\n");
472 return machine_state.have_speedstep;
476 static void init_arch_intel(void)
480 rdmsrl(MSR_MISC_ENABLE_IA32, val);
482 //INFO("P-State: prior ENABLE=%llx\n",val);
484 // store prior speedstep setting
485 get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
486 put_cpu_var(core_state);
488 // enable speedstep (probably already on)
490 wrmsrl(MSR_MISC_ENABLE_IA32, val);
492 //INFO("P-State: write ENABLE=%llx\n",val);
496 static void deinit_arch_intel(void)
500 rdmsrl(MSR_MISC_ENABLE_IA32, val);
502 //INFO("P-State: deinit: ENABLE=%llx\n",val);
504 val &= ~(1ULL << 16);
505 val |= get_cpu_var(core_state).prior_speedstep << 16;
506 put_cpu_var(core_state);
508 wrmsrl(MSR_MISC_ENABLE_IA32, val);
510 //INFO("P-state: deinit ENABLE=%llx\n",val);
// get_pstate hook for Intel: reads MSR_PERF_STAT_IA32 into val.
// NOTE(review): the declaration of val and the return statement are not
// visible in this span; confirm what is returned (raw MSR value or only
// the low 16-bit pstate field described by struct perf_stat_reg_intel).
514 /* TODO: Intel P-states require sampling at intervals... */
515 static uint64_t get_pstate_intel(void)
519 rdmsrl(MSR_PERF_STAT_IA32,val);
521 //INFO("P-State: Get: 0x%llx\n", val);
523 // should check if turbo is active, in which case
524 // this value is not the whole story
529 static void set_pstate_intel(uint64_t p)
534 if (intel_num_pstates==0) {
537 if (p>=intel_num_pstates) {
538 p=intel_num_pstates-1;
542 ctrl=intel_pstate_to_ctrl[p].ctrl;
544 /* ...Intel IDA (dynamic acceleration)
545 if (c->no_turbo && !c->turbo_disabled) {
549 // leave all bits along expect for the likely
552 rdmsrl(MSR_PERF_CTL_IA32, val);
553 INFO("P-State: Pre-Set: 0x%llx\n", val);
556 val |= ctrl & 0xffffULL;
558 INFO("P-State: Set: 0x%llx\n", val);
560 wrmsrl(MSR_PERF_CTL_IA32, val);
562 get_cpu_var(core_state).cur_pstate = p;
563 put_cpu_var(core_state);
/*
 * get_min_pstate hook for Intel.  Pstates are indices into the mapping
 * table, which starts at 0, so the minimum index is always 0.
 */
static uint64_t get_min_pstate_intel(void)
{
    return 0;
}
574 static uint64_t get_max_pstate_intel (void)
576 if (intel_num_pstates==0) {
579 return intel_num_pstates-1;
583 static struct pstate_core_funcs intel_funcs =
585 .arch_init = init_arch_intel,
586 .arch_deinit = deinit_arch_intel,
587 .get_pstate = get_pstate_intel,
588 .set_pstate = set_pstate_intel,
589 .get_max_pstate = get_max_pstate_intel,
590 .get_min_pstate = get_min_pstate_intel,
595 /***********************************************
596 Arch determination and setup
597 ***********************************************/
/*
 * Execute CPUID with EAX = id and store the resulting EAX, EBX, ECX and
 * EDX into dest[0..3], in that order.
 */
static inline void cpuid_string (uint32_t id, uint32_t dest[4])
{
    __asm__ volatile("cpuid"
                     :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
                     :"a"(id));
}
/*
 * Fetch the 12-character CPU vendor string with CPUID leaf 0.
 *
 * name must have room for 13 bytes; it receives EBX, EDX, ECX (in that
 * order) followed by a terminating NUL.  The bytes are copied with
 * memcpy so no alignment or aliasing assumption is made about name.
 *
 * Returns the EAX value CPUID leaf 0 produced.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t dest[4];

    cpuid_string(0,dest);

    memcpy(name,     &dest[1], sizeof(uint32_t));
    memcpy(name + 4, &dest[3], sizeof(uint32_t));
    memcpy(name + 8, &dest[2], sizeof(uint32_t));
    name[12] = 0;

    return dest[0];
}
/* Non-zero when the CPU vendor string is "GenuineIntel". */
static int is_intel (void)
{
    char name[13];

    get_cpu_vendor(name);
    return !strcmp(name,"GenuineIntel");
}
/* Non-zero when the CPU vendor string is "AuthenticAMD". */
static int is_amd (void)
{
    char name[13];

    get_cpu_vendor(name);
    return !strcmp(name,"AuthenticAMD");
}
638 static int pstate_arch_setup(void)
642 machine_state.arch = AMD;
643 machine_state.funcs = &amd_funcs;
644 machine_state.supports_pstates = supports_pstates_amd();
645 INFO("PSTATE: P-State initialized for AMD\n");
646 } else if (is_intel()) {
647 machine_state.arch = INTEL;
648 machine_state.funcs = &intel_funcs;
649 machine_state.supports_pstates = supports_pstates_intel();
650 INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
654 machine_state.arch = OTHER;
655 machine_state.funcs = NULL;
656 machine_state.supports_pstates = 0;
657 INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
666 /******************************************************************
668 *****************************************************************/
672 * This stub governor is simply a placeholder for preventing
673 * frequency changes from the Linux side. For now, we simply leave
674 * the frequency as is when we acquire control.
676 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
680 /* we can't use cpufreq_driver_target here as it can result
681 * in a circular dependency, so we'll just do nothing.
683 case CPUFREQ_GOV_START:
684 case CPUFREQ_GOV_STOP:
685 case CPUFREQ_GOV_LIMITS:
689 ERROR("Undefined governor command\n");
697 static struct cpufreq_governor stub_governor =
699 .name = PALACIOS_GOVNAME,
700 .governor = governor_run,
701 .owner = THIS_MODULE,
705 static inline void pstate_register_linux_governor(void)
707 cpufreq_register_governor(&stub_governor);
711 static inline void pstate_unregister_linux_governor(void)
713 cpufreq_unregister_governor(&stub_governor);
717 static int get_current_governor(char **buf, unsigned int cpu)
719 struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
720 char * govname = NULL;
723 ERROR("could not allocate cpufreq_policy\n");
727 if (cpufreq_get_policy(policy, cpu) != 0) {
728 ERROR("Could not get current cpufreq policy\n");
732 /* We're in interrupt context, should probably not wait here */
733 govname = palacios_alloc(MAX_GOV_NAME_LEN);
735 ERROR("Could not allocate space for governor name\n");
739 strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
741 get_cpu_var(core_state).linux_governor = govname;
742 put_cpu_var(core_state);
746 palacios_free(policy);
751 palacios_free(policy);
756 /* passed to the userspacehelper interface for cleanup */
757 static void gov_switch_cleanup(struct subprocess_info * s)
759 palacios_free(s->argv[2]);
760 palacios_free(s->argv);
// Ask user space to select cpufreq governor s on the given cpu by running
// a shell command that writes s into that cpu's scaling_governor sysfs file.
// The helper is started with UMH_NO_WAIT, so the value returned reflects
// only whether the helper could be launched.
// NOTE(review): the declaration of argv, the allocation-failure checks, the
// assignments to argv[0..3] and the remaining envp entries are not visible
// in this span; gov_switch_cleanup() frees argv[2] and argv.
766 * @s - the governor to switch to
768 static int governor_switch(char * s, unsigned int cpu)
770 char * path_str = NULL;
773 static char * envp[] = {
776 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
779 argv = palacios_alloc(4*sizeof(char*));
781 ERROR("Couldn't allocate argv struct\n");
785 path_str = palacios_alloc(MAX_PATH_LEN);
787 ERROR("Couldn't allocate path string\n");
790 memset(path_str, 0, MAX_PATH_LEN);
// s is interpolated into a shell command line; callers in this file pass
// PALACIOS_GOVNAME or a name previously read from the kernel
792 snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
799 /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
800 return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
808 static inline void free_linux_governor(void)
810 palacios_free(get_cpu_var(core_state).linux_governor);
811 put_cpu_var(core_state);
815 static int linux_setup_palacios_governor(void)
818 unsigned int cpu = get_cpu();
820 /* KCH: we assume the v3vee governor is already
821 * registered with kernel by this point
824 if (get_current_governor(&gov, cpu) < 0) {
825 ERROR("Could not get current governor\n");
829 DEBUG("saving current governor (%s)\n", gov);
831 get_cpu_var(core_state).linux_governor = gov;
832 put_cpu_var(core_state);
834 DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
836 /* set the new one to ours */
837 if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
838 ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
// Walks the cpufreq frequency table for a cpu, comparing each entry against
// the current policy frequency (policy->cur).
// NOTE(review): the declarations of cpu and i, the handling of the counter
// inside the loop, and the return statements are not visible in this span.
// NOTE(review): unlike linux_get_freq(), the result of cpufreq_get_policy()
// is not checked here, and table is indexed without a NULL check.
847 static int linux_get_pstate(void)
849 struct cpufreq_policy * policy = NULL;
850 struct cpufreq_frequency_table *table;
853 unsigned int count = 0;
855 policy = palacios_alloc(sizeof(struct cpufreq_policy));
857 ERROR("Could not allocate policy struct\n");
861 cpufreq_get_policy(policy, cpu);
862 table = cpufreq_frequency_get_table(cpu);
// iterate until the table's end marker
864 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
866 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
870 if (table[i].frequency == policy->cur) {
877 palacios_free(policy);
// Queries the cpufreq policy of a cpu.
// NOTE(review): the declaration of cpu, the value that is returned, and the
// release of policy are not visible in this span -- confirm that policy is
// freed on every path.
884 static int linux_get_freq(void)
886 struct cpufreq_policy * policy = NULL;
889 policy = palacios_alloc(sizeof(struct cpufreq_policy));
891 ERROR("Could not allocate policy struct\n");
895 if (cpufreq_get_policy(policy, cpu)) {
896 ERROR("Could not get current policy\n");
// Selects a frequency from the cpufreq table of a cpu according to p and
// requests it through cpufreq_driver_target() with CPUFREQ_RELATION_H.
// NOTE(review): the declarations of cpu, i and last_valid, the comparison
// that matches p against the table position, and the return statements are
// not visible in this span.  table is indexed without a NULL check.
904 static int linux_set_pstate(uint8_t p)
906 struct cpufreq_policy * policy = NULL;
907 struct cpufreq_frequency_table *table;
910 unsigned int count = 0;
914 policy = palacios_alloc(sizeof(struct cpufreq_policy));
916 ERROR("Could not allocate policy struct\n");
920 if (cpufreq_get_policy(policy, cpu)) {
921 ERROR("Could not get current policy\n");
924 table = cpufreq_frequency_get_table(cpu);
// iterate until the table's end marker
926 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
928 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
933 cpufreq_driver_target(policy, table[i].frequency, CPUFREQ_RELATION_H);
941 /* we need to deal with the case in which we get a number > max pstate */
// fall back to the last valid table entry
943 cpufreq_driver_target(policy, table[last_valid].frequency, CPUFREQ_RELATION_H);
946 palacios_free(policy);
950 palacios_free(policy);
// Requests frequency f through cpufreq_driver_target() after comparing it
// with the policy's min and max.
// NOTE(review): the declarations of cpu and freq, the assignments made in
// the range checks, and the return statement are not visible in this span.
// The result of cpufreq_get_policy() is not checked here.
955 static int linux_set_freq(uint64_t f)
957 struct cpufreq_policy * policy = NULL;
961 policy = palacios_alloc(sizeof(struct cpufreq_policy));
963 ERROR("Could not allocate policy struct\n");
967 cpufreq_get_policy(policy, cpu);
969 if (f < policy->min) {
971 } else if (f > policy->max) {
977 cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_H);
979 palacios_free(policy);
984 static int linux_restore_defaults(void)
986 unsigned int cpu = get_cpu();
989 gov = get_cpu_var(core_state).linux_governor;
990 put_cpu_var(core_state);
992 DEBUG("restoring previous governor (%s)\n", gov);
994 if (governor_switch(gov, cpu) < 0) {
995 ERROR("Could not restore governor to (%s)\n", gov);
999 free_linux_governor();
1003 free_linux_governor();
1009 /******************************************************************
1010 Generic Interface as provided to Palacios and to the rest of the
1012 ******************************************************************/
// Per-core initialisation: puts the core in host-control mode, records the
// min/max pstate from the architecture hooks (or zeros when there are
// none), records cpufreq limits when a policy exists, and logs the ACPI
// performance states of this CPU.
// NOTE(review): the declarations of cpu and i, the branch conditions around
// the two assignment groups, and any release of p are not visible in this
// span.
// NOTE(review): get_cpu_var() appears many times against a single visible
// put_cpu_var() for each variable; confirm that preemption accounting is
// balanced.  The final loop dereferences processors->performance without a
// visible NULL check.
1014 static void init_core(void)
1017 struct cpufreq_policy *p;
1021 DEBUG("P-State Core Init\n");
1023 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1024 get_cpu_var(core_state).cur_pstate = 0;
1026 if (machine_state.funcs) {
1027 get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1028 get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1030 get_cpu_var(core_state).min_pstate = 0;
1031 get_cpu_var(core_state).max_pstate = 0;
1035 cpu = get_cpu(); put_cpu();
1037 p = cpufreq_cpu_get(cpu);
1040 get_cpu_var(core_state).have_cpufreq = 0;
1041 get_cpu_var(core_state).min_freq_khz=0;
1042 get_cpu_var(core_state).max_freq_khz=0;
1043 get_cpu_var(core_state).cur_freq_khz=0;
1045 get_cpu_var(core_state).have_cpufreq = 1;
1046 get_cpu_var(core_state).min_freq_khz=p->min;
1047 get_cpu_var(core_state).max_freq_khz=p->max;
1048 get_cpu_var(core_state).cur_freq_khz=p->cur;
1053 put_cpu_var(core_state);
1055 for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) {
1056 INFO("P-State: %u: freq=%llu ctrl=%llx",
1058 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1059 get_cpu_var(processors)->performance->states[i].control);
1061 put_cpu_var(processors);
1065 void palacios_pstate_ctrl_release(void);
/* Per-core teardown: hand P-state control of the calling core back to the host. */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    palacios_pstate_ctrl_release();
}
1076 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c)
1078 memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1081 c->features = V3_PSTATE_INTERNAL_CONTROL;
1083 if (get_cpu_var(core_state).have_cpufreq) {
1084 c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1087 if (machine_state.arch==AMD || machine_state.arch==INTEL) {
1088 c->features |= V3_PSTATE_DIRECT_CONTROL;
1090 c->cur_mode = get_cpu_var(core_state).mode;
1091 c->min_pstate = get_cpu_var(core_state).min_pstate;
1092 c->max_pstate = get_cpu_var(core_state).max_pstate;
1093 c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1094 c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1095 c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1096 c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1098 put_cpu_var(core_state);
1105 uint64_t palacios_pstate_ctrl_get_pstate(void)
1107 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1108 put_cpu_var(core_state);
1109 return machine_state.funcs->get_pstate();
1110 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1111 put_cpu_var(core_state);
1112 return linux_get_pstate();
1114 put_cpu_var(core_state);
1120 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1122 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1123 put_cpu_var(core_state);
1124 machine_state.funcs->set_pstate(p);
1125 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1126 put_cpu_var(core_state);
1127 linux_set_pstate(p);
/*
 * Cross-call adapter: the pstate arrives packed in the pointer argument
 * and is narrowed to 8 bits before being applied.
 */
void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
{
    uint8_t pstate = (uint8_t)(uint64_t)p;

    palacios_pstate_ctrl_set_pstate(pstate);
}
1138 uint64_t palacios_pstate_ctrl_get_freq(void)
1140 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1141 put_cpu_var(core_state);
1142 return linux_get_freq();
1144 put_cpu_var(core_state);
1150 void palacios_pstate_ctrl_set_freq(uint64_t p)
1152 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1153 put_cpu_var(core_state);
1156 put_cpu_var(core_state);
1160 static int switch_to_external(void)
1162 DEBUG("switch from host control to external\n");
1164 if (!(get_cpu_var(core_state).have_cpufreq)) {
1165 put_cpu_var(core_state);
1166 ERROR("No cpufreq - cannot switch to external...\n");
1169 put_cpu_var(core_state);
1171 linux_setup_palacios_governor();
1173 get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1174 put_cpu_var(core_state);
1180 static int switch_to_direct(void)
1182 DEBUG("switch from host control to direct\n");
1184 if (get_cpu_var(core_state).have_cpufreq) {
1185 put_cpu_var(core_state);
1186 DEBUG("switch to direct from cpufreq\n");
1188 // The implementation would set the policy and governor to peg cpu
1189 // regardless of load
1190 linux_setup_palacios_governor();
1193 if (machine_state.funcs && machine_state.funcs->arch_init) {
1194 get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1196 machine_state.funcs->arch_init();
1198 put_cpu_var(core_state);
1205 static int switch_to_internal(void)
1207 DEBUG("switch from host control to internal\n");
1209 if (get_cpu_var(core_state).have_cpufreq) {
1210 put_cpu_var(core_state);
1211 DEBUG("switch to internal on machine with cpu freq\n");
1212 linux_setup_palacios_governor();
1215 get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1217 put_cpu_var(core_state);
1223 static int switch_from_external(void)
1225 if (!(get_cpu_var(core_state).have_cpufreq)) {
1226 put_cpu_var(core_state);
1227 ERROR("No cpufreq - how did we get here... external...\n");
1231 DEBUG("Switching back to host control from external\n");
1233 if (get_cpu_var(core_state).have_cpufreq) {
1234 linux_restore_defaults();
1237 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1239 put_cpu_var(core_state);
1245 static int switch_from_direct(void)
1248 DEBUG("Switching back to host control from direct\n");
1250 // Set maximum performance, just in case there is no host control
1251 machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1252 machine_state.funcs->arch_deinit();
1254 if (get_cpu_var(core_state).have_cpufreq) {
1255 linux_restore_defaults();
1258 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1260 put_cpu_var(core_state);
1266 static int switch_from_internal(void)
1268 DEBUG("Switching back to host control from internal\n");
1270 if (get_cpu_var(core_state).have_cpufreq) {
1271 // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1272 // The implementation would switch back to default policy and governor
1273 linux_restore_defaults();
1276 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1278 put_cpu_var(core_state);
1285 void palacios_pstate_ctrl_acquire(uint32_t type)
1287 if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) {
1288 palacios_pstate_ctrl_release();
1291 put_cpu_var(core_state);
1294 case V3_PSTATE_EXTERNAL_CONTROL:
1295 switch_to_external();
1297 case V3_PSTATE_DIRECT_CONTROL:
1300 case V3_PSTATE_INTERNAL_CONTROL:
1301 switch_to_internal();
1304 ERROR("Unknown pstate control type %u\n",type);
1310 // Wrappers for xcalls
1311 static void palacios_pstate_ctrl_acquire_external(void)
1313 palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1316 static void palacios_pstate_ctrl_acquire_direct(void)
1318 palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1322 void palacios_pstate_ctrl_release(void)
1324 if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) {
1325 put_cpu_var(core_state);
1329 switch (get_cpu_var(core_state).mode) {
1330 case V3_PSTATE_EXTERNAL_CONTROL:
1331 switch_from_external();
1333 case V3_PSTATE_DIRECT_CONTROL:
1334 switch_from_direct();
1336 case V3_PSTATE_INTERNAL_CONTROL:
1337 switch_from_internal();
1340 ERROR("Unknown pstate control type %u\n",core_state.mode);
1344 put_cpu_var(core_state);
1349 static void update_hw_pstate(void *arg)
1351 if (machine_state.funcs && machine_state.funcs->get_pstate) {
1352 get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1353 put_cpu_var(core_state);
1355 get_cpu_var(core_state).cur_hw_pstate = 0;
1356 put_cpu_var(core_state);
1361 /***************************************************************************
1362 PROC Interface to expose state
1363 ***************************************************************************/
1365 static int pstate_show(struct seq_file * file, void * v)
1368 unsigned int numcpus = num_online_cpus();
1370 seq_printf(file, "V3VEE DVFS Status\n\n");
1372 for (cpu=0;cpu<numcpus;cpu++) {
1373 palacios_xcall(cpu,update_hw_pstate,0);
1376 seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1377 machine_state.arch==INTEL ? "Intel" :
1378 machine_state.arch==AMD ? "AMD" : "Other",
1379 machine_state.supports_pstates ? "Yes" : "No");
1381 for (cpu=0;cpu<numcpus;cpu++) {
1382 struct pstate_core_info *s = &per_cpu(core_state,cpu);
1383 seq_printf(file,"pcore %u: hw pstate 0x%x mode %s of [ host ",cpu,
1385 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1386 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1387 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
1388 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1389 if (s->have_cpufreq) {
1390 seq_printf(file,"external ");
1392 if (machine_state.supports_pstates) {
1393 seq_printf(file,"direct ");
1395 seq_printf(file,"internal ] ");
1396 if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
1397 seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1399 if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
1400 seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1402 seq_printf(file,"\n");
1407 static int pstate_open(struct inode * inode, struct file * file)
1409 return single_open(file, pstate_show, NULL);
1413 static struct file_operations pstate_fops = {
1414 .owner = THIS_MODULE,
1415 .open = pstate_open,
1417 .llseek = seq_lseek,
1418 .release = seq_release
1421 int pstate_proc_setup(void)
1423 struct proc_dir_entry *proc;
1425 proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1428 ERROR("Failed to create proc entry for p-state control\n");
1432 proc->proc_fops = &pstate_fops;
/* Remove the "v3-dvfs" entry from the Palacios proc directory. */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *dir = palacios_get_procdir();

    remove_proc_entry("v3-dvfs",dir);
}
1442 /********************************************************************
1443 User interface (ioctls)
1444 ********************************************************************/
// Global-control handler for V3_DVFS_CTRL: copies a v3_dvfs_ctrl_request
// from user space, validates the target pcore against the number of online
// CPUs, and forwards the request to that core through palacios_xcall().
// NOTE(review): the switch header(s), break/return statements and the
// values returned on each path are not visible in this span.
// NOTE(review): several targets are cast to void (*)(void*) although they
// are declared with a different signature (no parameter, or a uint64_t
// parameter); confirm this is acceptable on every supported ABI.
1446 static int dvfs_ctrl(unsigned int cmd, unsigned long arg)
1448 struct v3_dvfs_ctrl_request r;
1450 if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1451 ERROR("Failed to copy DVFS request from user\n");
// pcore comes from user space, so it is range-checked before use
1455 if (r.pcore >= num_online_cpus()) {
1456 ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1461 case V3_DVFS_ACQUIRE: {
1462 switch (r.acq_type) {
1463 case V3_DVFS_EXTERNAL:
1464 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1467 case V3_DVFS_DIRECT:
1468 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1472 ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1477 case V3_DVFS_RELEASE: {
1478 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1482 case V3_DVFS_SETFREQ: {
// the frequency travels to the target core packed into the pointer argument
1483 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1487 case V3_DVFS_SETPSTATE: {
// the pstate travels packed into the pointer argument; the wrapper narrows it to 8 bits
1488 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1492 ERROR("Unknown DVFS command %u\n",r.cmd);
1500 void pstate_user_setup(void)
1502 add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1506 void pstate_user_teardown(void)
1508 remove_global_ctrl(V3_DVFS_CTRL);
1511 static struct v3_host_pstate_ctrl_iface hooks = {
1512 .get_chars = palacios_pstate_ctrl_get_chars,
1513 .acquire = palacios_pstate_ctrl_acquire,
1514 .release = palacios_pstate_ctrl_release,
1515 .set_pstate = palacios_pstate_ctrl_set_pstate,
1516 .get_pstate = palacios_pstate_ctrl_get_pstate,
1517 .set_freq = palacios_pstate_ctrl_set_freq,
1518 .get_freq = palacios_pstate_ctrl_get_freq,
1523 static int pstate_ctrl_init(void)
1526 unsigned int numcpus = num_online_cpus();
1528 pstate_arch_setup();
1530 for (cpu=0;cpu<numcpus;cpu++) {
1531 palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1534 V3_Init_Pstate_Ctrl(&hooks);
1536 if (pstate_proc_setup()) {
1537 ERROR("Unable to initialize P-State Control\n");
1541 pstate_user_setup();
1543 pstate_register_linux_governor();
1545 INFO("P-State Control Initialized\n");
1550 static int pstate_ctrl_deinit(void)
1553 unsigned int numcpus=num_online_cpus();
1555 pstate_unregister_linux_governor();
1557 pstate_user_teardown();
1559 pstate_proc_teardown();
1561 // release pstate control if we have it, and we need to do this on each processor
1562 for (cpu=0;cpu<numcpus;cpu++) {
1563 palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1567 // Free any mapping table we built for Intel
1568 if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) {
1569 palacios_free(intel_pstate_to_ctrl);
1577 static struct linux_ext pstate_ext = {
1578 .name = "PSTATE_CTRL",
1579 .init = pstate_ctrl_init,
1580 .deinit = pstate_ctrl_deinit,
1582 .guest_deinit = NULL,
1586 register_extension(&pstate_ext);