2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11 * all rights reserved.
13 * Author: Kyle C. Hale <kh@u.northwestern.edu>
14 * Shiva Rao <shiva.rao.717@gmail.com>
15 * Peter Dinda <pdinda@northwestern.edu>
17 * This is free software. you are permitted to use,
18 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
32 #include <asm/msr-index.h>
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
38 #include <interfaces/vmm_pstate_ctrl.h>
41 #include "iface-pstate-ctrl.h"
43 #include "linux-exts.h"
46 This P-STATE control implementation includes:
48 - Direct control of Intel and AMD processor pstates
49 - External control of processor states via Linux (unimplemented)
50 - Internal control of processor states in Palacios (handoff from Linux)
52 Additionally, it provides a user-space interface for manipulating
53 p-state regardless of the host's functionality. This includes
54 an ioctl for commanding the implementation and a /proc file for
55 showing current status and capabilities.
57 What we mean by "pstate" here is the processor's internal
58 configuration. For AMD, this is defined as being the same as
59 the ACPI-defined p-state. For Intel, it is not. There, it is the
60 contents of the perf ctl MSR, which, often, is the frequency id
61 and voltage id (the multipliers).
// Name under which the stub cpufreq governor (stub_governor) is registered.
66 #define PALACIOS_GOVNAME "v3vee"
// Size of the buffer governor_switch() formats its shell command into.
67 #define MAX_PATH_LEN 128
// Size of the buffer get_current_governor() copies the governor name into.
68 #define MAX_GOV_NAME_LEN 16
// Per-core p-state bookkeeping. One instance exists per CPU through the
// core_state per-CPU variable declared right after this struct.
71 struct pstate_core_info {
72 // Here we have the notion of host control
73 #define V3_PSTATE_HOST_CONTROL 0
74 // and all the modes from the Palacios interface:
75 // V3_PSTATE_EXTERNAL_CONTROL
76 // V3_PSTATE_DIRECT_CONTROL
77 // V3_PSTATE_INTERNAL_CONTROL
80 // Apply if we are under the DIRECT state
// Refreshed from machine_state.funcs->get_pstate() by update_hw_pstate().
85 uint8_t cur_hw_pstate;
87 // Apply if we are under the EXTERNAL state
88 uint64_t set_freq_khz; // this is the frequency we're hoping to get
89 uint64_t cur_freq_khz;
90 uint64_t max_freq_khz;
91 uint64_t min_freq_khz;
// Bit 16 of MSR_MISC_ENABLE_IA32 as found by init_arch_intel();
// deinit_arch_intel() writes it back.
94 uint8_t prior_speedstep;
95 uint8_t turbo_disabled;
100 // This is where we stash Linux's governor when we make a mode switch
// Heap-allocated string; released by free_linux_governor().
101 char * linux_governor;
102 // We have this so we can restore the original frequency when we started
103 uint64_t original_hz;
// Per-CPU instance of struct pstate_core_info. The rest of the file reaches
// it through get_cpu_var()/put_cpu_var() or per_cpu(core_state, cpu).
108 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
112 // These are used to assert DIRECT control over the core pstates
// Table of architecture-specific callbacks; amd_funcs and intel_funcs are
// the two instances and machine_state.funcs points at the one in use.
113 struct pstate_core_funcs {
// Called by switch_to_direct() when direct control is taken.
114 void (*arch_init)(void);
// Called by switch_from_direct() when direct control is given back.
115 void (*arch_deinit)(void);
// Queried once per core by init_core().
116 uint64_t (*get_min_pstate)(void);
117 uint64_t (*get_max_pstate)(void);
// Read and program the current p-state of the calling core.
118 uint64_t (*get_pstate)(void);
119 void (*set_pstate)(uint64_t pstate);
// Machine-wide (not per-core) description of the host CPU. Filled in by
// pstate_arch_setup() and the supports_pstates_amd()/supports_pstates_intel()
// probes.
122 struct pstate_machine_info {
123 enum {INTEL, AMD, OTHER } arch;
// Return value of the vendor-specific supports_pstates_*() probe.
124 int supports_pstates;
134 int have_opportunistic; // this means "Turbo Boost" or "IDA"
135 int have_policy_hint;
136 int have_hwp; // hardware-controlled performance states
137 int have_hdc; // hardware duty cycling
138 int have_mwait_ext; // mwait power extensions
139 int have_mwait_int; // mwait wakes on interrupt
142 int have_pstate_hw_coord; // mperf/aperf
144 // used for DIRECT control
// NULL when the vendor is neither AMD nor Intel (see pstate_arch_setup()).
145 struct pstate_core_funcs *funcs;
// The single, file-local description of the host machine.
149 static struct pstate_machine_info machine_state;
152 /****************************************************
154 ***************************************************/
156 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
// Read by get_min_pstate_amd() and get_max_pstate_amd().
157 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
// Written by set_pstate_amd().
158 #define MSR_PSTATE_CTL_REG_AMD 0xc0010062
// Read by get_pstate_amd().
159 #define MSR_PSTATE_STAT_REG_AMD 0xc0010063
// Layout of the value read from MSR_PSTATE_LIMIT_REG_AMD; consumed by
// get_min_pstate_amd() and get_max_pstate_amd().
161 struct p_state_limit_reg_amd {
165 uint8_t pstate_limit : 4; /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
166 uint8_t pstate_max : 4; /* highest P-state value supported (lowest perf) */
169 } __attribute__((packed));
170 } __attribute__((packed));
// Layout of the value read from MSR_PSTATE_STAT_REG_AMD; get_pstate_amd()
// returns its reg.pstate field.
173 struct p_state_stat_reg_amd {
180 } __attribute__((packed));
181 } __attribute__((packed));
// Layout of the value set_pstate_amd() writes to MSR_PSTATE_CTL_REG_AMD.
184 struct p_state_ctl_reg_amd {
191 } __attribute__((packed));
192 } __attribute__((packed));
// Probe CPUID for AMD power-management features and record them in
// machine_state. Returns machine_state.have_pstate, which is nonzero when
// leaf 0x80000007 reports EDX bit 7.
195 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
196 static uint8_t supports_pstates_amd (void)
198 uint32_t eax, ebx, ecx, edx;
// Leaf 0x80000007: EDX bit 7 = p-states, bit 9 = core boost, bit 11 = feedback.
200 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
201 machine_state.have_pstate = !!(edx & (1 << 7));
202 machine_state.have_coreboost = !!(edx & (1<<9));
203 machine_state.have_feedback = !!(edx & (1<<11));
// Leaf 0x6: ECX bit 0 is recorded as hardware coordination (mperf/aperf).
205 cpuid(0x6, &eax, &ebx, &ecx, &edx);
206 machine_state.have_pstate_hw_coord = !!(ecx & 1);
208 INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
209 machine_state.have_pstate,
210 machine_state.have_coreboost,
211 machine_state.have_feedback,
212 machine_state.have_pstate_hw_coord);
214 return machine_state.have_pstate;
// AMD hook for pstate_core_funcs.arch_init; intentionally does nothing.
220 static void init_arch_amd(void)
222 /* KCH: nothing to do here */
// AMD hook for pstate_core_funcs.arch_deinit; intentionally does nothing.
226 static void deinit_arch_amd(void)
228 /* KCH: nothing to do here */
// Read the current p-state of the calling core from the AMD status MSR,
// cache it in core_state.cur_pstate and return it.
232 static uint64_t get_pstate_amd(void)
234 struct p_state_stat_reg_amd pstat;
236 rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
238 get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
239 put_cpu_var(core_state);
241 return pstat.reg.pstate;
// Write pctl to the AMD P-state control MSR on the calling core and record
// p as the cached current p-state in core_state.
245 static void set_pstate_amd(uint64_t p)
247 struct p_state_ctl_reg_amd pctl;
251 wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
253 get_cpu_var(core_state).cur_pstate=p;
254 put_cpu_var(core_state);
259 * NOTE: HW may change this value at runtime
// Return the pstate_max field of the AMD limit MSR: the highest p-state
// number (lowest performance) according to the field's own comment.
261 static uint64_t get_max_pstate_amd(void)
263 struct p_state_limit_reg_amd plimits;
265 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
267 return plimits.reg.pstate_max;
// Return the pstate_limit field of the AMD limit MSR: the lowest p-state
// number (highest performance) currently allowed, per the field's comment.
271 static uint64_t get_min_pstate_amd(void)
273 struct p_state_limit_reg_amd plimits;
275 rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
277 return plimits.reg.pstate_limit;
// Direct-control callbacks for AMD; pstate_arch_setup() installs this table
// in machine_state.funcs when the vendor string is AuthenticAMD.
281 static struct pstate_core_funcs amd_funcs =
283 .arch_init = init_arch_amd,
284 .arch_deinit = deinit_arch_amd,
285 .get_pstate = get_pstate_amd,
286 .set_pstate = set_pstate_amd,
287 .get_max_pstate = get_max_pstate_amd,
288 .get_min_pstate = get_min_pstate_amd,
293 /***********************************************************
295 **********************************************************/
299 This implementation uses SpeedStep, but does check
300 to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
304 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
305 #define MSR_MPERF_IA32 0x000000e7
306 #define MSR_APERF_IA32 0x000000e8
// Bit 16 is saved and restored by init_arch_intel()/deinit_arch_intel().
307 #define MSR_MISC_ENABLE_IA32 0x000001a0
308 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
309 #define MSR_PLATFORM_INFO_IA32 0x000000ce
// Read-modify-written by set_pstate_intel().
310 #define MSR_PERF_CTL_IA32 0x00000199
// Read by get_pstate_intel().
311 #define MSR_PERF_STAT_IA32 0x00000198
// NOTE(review): "ENERY" looks like a misspelling of "ENERGY"; renaming it
// would require updating every user of the macro.
312 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
315 /* Note that the actual meaning of the pstate
316 in the control and status registers is actually
317 implementation dependent, unlike AMD. The "official"
318 way to figure out the mapping from pstate to
319 these values is via ACPI. What is written in the register
320 is an "id" of an operation point
322 "Often", the 16 bit field consists of a high order byte
323 which is the frequency (the multiplier) and the low order
326 // MSR_PERF_CTL_IA32 r/w
// Field layout of the 64-bit performance-control MSR: a 16-bit target
// p-state id, 16 reserved bits, a one-bit turbo/IDA disable flag and 31
// reserved bits.
327 struct perf_ctl_reg_intel {
331 // This is the target
332 // Note, not the ACPI pstate, but
333 // Intel's notion of pstate is that it's opaque
334 // for lots of implementations it seems to be
335 // frequency_id : voltage_id
336 // where frequency_id is typically the multiplier
337 uint16_t pstate : 16;
338 uint16_t reserved : 16;
339 // set to 1 to *disengage* dynamic acceleration
340 // Note that "IDA" and "Turbo" use the same interface
341 uint16_t dynamic_accel_disable : 1;
342 uint32_t reserved2 : 31;
344 } __attribute__((packed));
345 } __attribute__((packed));
347 // MSR_PERF_STAT_IA32 r
// Field layout of the performance-status MSR: the low 16 bits carry the
// current p-state id, the remaining 48 bits are treated as reserved.
348 struct perf_stat_reg_intel {
352 // this is the current
353 uint16_t pstate : 16;
354 uint64_t reserved : 48;
356 } __attribute__((packed));
357 } __attribute__((packed));
359 // MSR_ENERGY_PERF_BIAS_IA32 r/w
// Field layout of the energy/performance bias MSR: a 4-bit policy hint
// followed by 60 reserved bits.
360 struct enery_perf_bias_reg_intel {
364 // this is the current
365 uint8_t policy_hint : 4;
366 uint64_t reserved : 60;
368 } __attribute__((packed));
369 } __attribute__((packed));
// Bit-field view of a platform/turbo information register.
// NOTE(review): presumably overlays MSR_PLATFORM_INFO_IA32; no reader of this
// struct is visible in this part of the file, so confirm before relying on it.
372 struct turbo_mode_info_reg_intel {
377 uint8_t max_noturbo_ratio : 8;
379 uint8_t ppin_cap : 1;
381 uint8_t ratio_limit : 1;
382 uint8_t tdc_tdp_limit : 1;
384 uint8_t min_ratio : 8;
387 } __attribute__((packed));
388 } __attribute__((packed));
390 // This replicates the critical information in Linux's struct acpi_processor_px
391 // To make it easier to port to other OSes.
// One entry per p-state index; set_pstate_intel() looks up .ctrl by index.
392 struct intel_pstate_info {
393 uint64_t freq; // KHz
394 uint64_t ctrl; // What to write into the _CTL MSR to get this
397 // The internal array will be used if we cannot build the table locally
// As declared here the internal table is a null pointer with zero entries.
398 static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0;
399 static int intel_num_pstates_internal=0;
401 // These will either point to the internal array or to a constructed array
// Populated by supports_pstates_intel(); indexed by set_pstate_intel().
402 static struct intel_pstate_info *intel_pstate_to_ctrl=0;
403 static int intel_num_pstates=0;
406 /* CPUID.01:ECX.EIST(7) - Enhanced Intel SpeedStep */
// Probe CPUID for Intel power-management features, record them in
// machine_state and, when SpeedStep is present, build the table that maps a
// p-state index to the value written into the perf-control MSR.
// Returns machine_state.have_speedstep.
407 static uint8_t supports_pstates_intel(void)
409 /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
411 uint32_t eax, ebx, ecx, edx;
// Leaf 0x1: ECX bit 7 is taken as the SpeedStep flag.
413 cpuid(0x1, &eax, &ebx, &ecx, &edx);
414 machine_state.have_speedstep = !!(ecx & (1 << 7));
// Leaf 0x6: thermal and power management feature bits.
416 cpuid(0x6, &eax, &ebx, &ecx, &edx);
417 machine_state.have_pstate_hw_coord = !!(ecx & 1); // ?
418 machine_state.have_opportunistic = !!(eax & 1<<1);
419 machine_state.have_policy_hint = !!(ecx & 1<<3);
420 machine_state.have_hwp = !!(eax & 1<<7);
421 machine_state.have_hdc = !!(eax & 1<<13);
// Leaf 0x5: MWAIT extensions.
423 cpuid(0x5, &eax, &ebx, &ecx, &edx);
424 machine_state.have_mwait_ext = !!(ecx & 1);
425 machine_state.have_mwait_int = !!(ecx & 1<<1);
428 INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
429 machine_state.have_speedstep,
430 machine_state.have_pstate_hw_coord,
431 machine_state.have_opportunistic,
432 machine_state.have_policy_hint,
433 machine_state.have_hwp,
434 machine_state.have_hdc,
435 machine_state.have_mwait_ext,
436 machine_state.have_mwait_int );
439 if (machine_state.have_speedstep) {
441 // Build mapping table (from "pstate" (0..) to ctrl value for MSR
// NOTE(review): get_cpu_var(processors) is evaluated several times in this
// function (twice in the test below, then once per field access in the loop)
// while each path calls put_cpu_var(processors) only once. In mainline Linux
// every get_cpu_var() disables preemption, so the pairs look unbalanced.
// Confirm, and consider taking the pointer once.
442 if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) {
443 put_cpu_var(processors);
444 // no acpi... revert to internal table
445 intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
446 intel_num_pstates=intel_num_pstates_internal;
// ACPI performance data is available: size the table from its state count.
448 intel_num_pstates = get_cpu_var(processors)->performance->state_count;
449 if (intel_num_pstates) {
450 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
451 if (!intel_pstate_to_ctrl) {
452 ERROR("P-State: Cannot allocate space for mapping...\n");
// freq is stored as core_frequency*1000; the struct documents freq as KHz,
// so core_frequency is presumably in MHz (confirm against the ACPI headers).
455 for (i=0;i<intel_num_pstates;i++) {
456 intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
457 intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
461 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
464 put_cpu_var(processors);
465 INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
466 for (i=0;i<intel_num_pstates;i++) {
467 INFO("P-State: Intel Mapping %u: freq=%llu ctrl=%llx\n",
468 i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
471 INFO("P-State: Intel: No speedstep here\n");
475 return machine_state.have_speedstep;
// Intel hook for pstate_core_funcs.arch_init. Reads MSR_MISC_ENABLE_IA32,
// remembers bit 16 in core_state.prior_speedstep so deinit_arch_intel() can
// put it back, then writes the register again.
479 static void init_arch_intel(void)
483 rdmsrl(MSR_MISC_ENABLE_IA32, val);
485 //INFO("P-State: prior ENABLE=%llx\n",val);
487 // store prior speedstep setting
488 get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
489 put_cpu_var(core_state);
491 // enable speedstep (probably already on)
493 wrmsrl(MSR_MISC_ENABLE_IA32, val);
495 //INFO("P-State: write ENABLE=%llx\n",val);
// Intel hook for pstate_core_funcs.arch_deinit. Restores bit 16 of
// MSR_MISC_ENABLE_IA32 to the value init_arch_intel() saved in
// core_state.prior_speedstep.
499 static void deinit_arch_intel(void)
503 rdmsrl(MSR_MISC_ENABLE_IA32, val);
505 //INFO("P-State: deinit: ENABLE=%llx\n",val);
// Clear bit 16, then OR the saved bit back in.
507 val &= ~(1ULL << 16);
508 val |= get_cpu_var(core_state).prior_speedstep << 16;
509 put_cpu_var(core_state);
511 wrmsrl(MSR_MISC_ENABLE_IA32, val);
513 //INFO("P-state: deinit ENABLE=%llx\n",val);
517 /* TODO: Intel P-states require sampling at intervals... */
// Intel hook for pstate_core_funcs.get_pstate. Reads MSR_PERF_STAT_IA32 on
// the calling core.
518 static uint64_t get_pstate_intel(void)
522 rdmsrl(MSR_PERF_STAT_IA32,val);
524 //INFO("P-State: Get: 0x%llx\n", val);
526 // should check if turbo is active, in which case
527 // this value is not the whole story
// Intel hook for pstate_core_funcs.set_pstate. Translates the p-state index
// p into a control value through intel_pstate_to_ctrl, merges its low 16 bits
// into MSR_PERF_CTL_IA32 on the calling core and caches p in core_state.
532 static void set_pstate_intel(uint64_t p)
// Guard for an empty mapping table.
537 if (intel_num_pstates==0) {
// Out-of-range requests are clamped to the last table entry.
540 if (p>=intel_num_pstates) {
541 p=intel_num_pstates-1;
545 ctrl=intel_pstate_to_ctrl[p].ctrl;
547 /* ...Intel IDA (dynamic acceleration)
548 if (c->no_turbo && !c->turbo_disabled) {
552 // leave all bits alone except for the likely
555 rdmsrl(MSR_PERF_CTL_IA32, val);
556 INFO("P-State: Pre-Set: 0x%llx\n", val);
// Only the low 16 bits of the table value go into the register image.
559 val |= ctrl & 0xffffULL;
561 INFO("P-State: Set: 0x%llx\n", val);
563 wrmsrl(MSR_PERF_CTL_IA32, val);
565 get_cpu_var(core_state).cur_pstate = p;
566 put_cpu_var(core_state);
// Intel hook for pstate_core_funcs.get_min_pstate (installed in intel_funcs).
570 static uint64_t get_min_pstate_intel(void)
// Intel hook for pstate_core_funcs.get_max_pstate. With a non-empty mapping
// table the result is the index of its last entry.
577 static uint64_t get_max_pstate_intel (void)
// The empty-table case is handled separately so the subtraction below
// is never applied to zero.
579 if (intel_num_pstates==0) {
582 return intel_num_pstates-1;
// Direct-control callbacks for Intel; pstate_arch_setup() installs this table
// in machine_state.funcs when the vendor string is GenuineIntel.
586 static struct pstate_core_funcs intel_funcs =
588 .arch_init = init_arch_intel,
589 .arch_deinit = deinit_arch_intel,
590 .get_pstate = get_pstate_intel,
591 .set_pstate = set_pstate_intel,
592 .get_max_pstate = get_max_pstate_intel,
593 .get_min_pstate = get_min_pstate_intel,
598 /***********************************************
599 Arch determination and setup
600 ***********************************************/
// Inline-assembly helper whose outputs are bound to EAX, EBX, ECX and EDX
// and stored into dest[0], dest[1], dest[2] and dest[3] respectively.
// get_cpu_vendor() calls it with id 0.
602 static inline void cpuid_string (uint32_t id, uint32_t dest[4])
605 :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
// Build the CPU vendor string in name from cpuid_string(0, ...): the twelve
// characters come from EBX, EDX and ECX, in that order.
// NOTE(review): the stores go through a uint32_t pointer cast of a char
// buffer, which relies on the buffer being suitably aligned and on the
// compiler tolerating the aliasing; memcpy would avoid both assumptions.
610 static int get_cpu_vendor (char name[13])
615 cpuid_string(0,dest);
// dest[1] = EBX, dest[3] = EDX, dest[2] = ECX.
617 ((uint32_t*)name)[0]=dest[1];
618 ((uint32_t*)name)[1]=dest[3];
619 ((uint32_t*)name)[2]=dest[2];
// Returns nonzero when the CPUID vendor string equals "GenuineIntel".
626 static int is_intel (void)
629 get_cpu_vendor(name);
630 return !strcmp(name,"GenuineIntel");
// Returns nonzero when the CPUID vendor string equals "AuthenticAMD".
634 static int is_amd (void)
637 get_cpu_vendor(name);
638 return !strcmp(name,"AuthenticAMD");
// Identify the CPU vendor and fill in machine_state: the arch tag, the
// callback table used for direct control, and whether p-states are supported
// according to the vendor-specific probe.
641 static int pstate_arch_setup(void)
645 machine_state.arch = AMD;
646 machine_state.funcs = &amd_funcs;
647 machine_state.supports_pstates = supports_pstates_amd();
648 INFO("PSTATE: P-State initialized for AMD\n");
649 } else if (is_intel()) {
650 machine_state.arch = INTEL;
651 machine_state.funcs = &intel_funcs;
652 machine_state.supports_pstates = supports_pstates_intel();
653 INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
// Unknown vendor: no callback table, so direct control is unavailable.
657 machine_state.arch = OTHER;
658 machine_state.funcs = NULL;
659 machine_state.supports_pstates = 0;
660 INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
669 /******************************************************************
671 *****************************************************************/
// Number of CPUs on which governor_run() has seen CPUFREQ_GOV_START without
// a matching CPUFREQ_GOV_STOP; modified only while v3_governor_mutex is held.
673 static unsigned cpus_using_v3_governor;
// Also held by pstate_switch_workfn() for the duration of a frequency change.
674 static DEFINE_MUTEX(v3_governor_mutex);
676 /* KCH: this will tell us when there is an actual frequency transition */
// cpufreq transition callback. Acts only for CPUs currently in
// V3_PSTATE_EXTERNAL_CONTROL mode; on CPUFREQ_POSTCHANGE it records the new
// frequency in that CPU's core_state.cur_freq_khz.
677 static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
680 struct cpufreq_freqs *freq = data;
682 if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) {
686 if (val == CPUFREQ_POSTCHANGE) {
687 DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
688 freq->cpu, freq->new);
689 per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new;
// Registered and unregistered by governor_run() as the first CPU starts and
// the last CPU stops using the stub governor.
697 static struct notifier_block v3_cpufreq_notifier_block = {
698 .notifier_call = v3_cpufreq_notifier
703 * This stub governor is simply a placeholder for preventing
704 * frequency changes from the Linux side. For now, we simply leave
705 * the frequency as is when we acquire control.
// Event handler of the stub cpufreq governor. On START it counts the CPU in,
// registers the transition notifier for the first such CPU and snapshots the
// policy frequencies into core_state; on STOP it undoes this.
707 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
709 unsigned cpu = policy->cpu;
712 /* we can't use cpufreq_driver_target here as it can result
713 * in a circular dependency, so we'll keep the current frequency as is
715 case CPUFREQ_GOV_START:
716 BUG_ON(!policy->cur);
718 mutex_lock(&v3_governor_mutex);
// The first CPU to start the governor installs the transition notifier.
720 if (cpus_using_v3_governor == 0) {
721 cpufreq_register_notifier(&v3_cpufreq_notifier_block,
722 CPUFREQ_TRANSITION_NOTIFIER);
725 cpus_using_v3_governor++;
727 per_cpu(core_state, cpu).set_freq_khz = policy->cur;
728 per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
729 per_cpu(core_state, cpu).max_freq_khz = policy->max;
730 per_cpu(core_state, cpu).min_freq_khz = policy->min;
732 mutex_unlock(&v3_governor_mutex);
734 case CPUFREQ_GOV_STOP:
735 mutex_lock(&v3_governor_mutex);
// NOTE(review): the counter is unsigned and decremented without a visible
// check for zero; a STOP without a prior START would wrap it.
737 cpus_using_v3_governor--;
// The last CPU to stop removes the notifier again.
739 if (cpus_using_v3_governor == 0) {
740 cpufreq_unregister_notifier(
741 &v3_cpufreq_notifier_block,
742 CPUFREQ_TRANSITION_NOTIFIER);
745 per_cpu(core_state, cpu).set_freq_khz = 0;
746 per_cpu(core_state, cpu).cur_freq_khz = 0;
747 per_cpu(core_state, cpu).max_freq_khz = 0;
748 per_cpu(core_state, cpu).min_freq_khz = 0;
750 mutex_unlock(&v3_governor_mutex);
752 case CPUFREQ_GOV_LIMITS:
756 ERROR("Undefined governor command (%u)\n", event);
// The placeholder governor handed to cpufreq_register_governor(); all of its
// events are handled by governor_run().
764 static struct cpufreq_governor stub_governor =
766 .name = PALACIOS_GOVNAME,
767 .governor = governor_run,
768 .owner = THIS_MODULE,
// Work queue on which frequency changes (pstate_switch_workfn) are run;
// created in pstate_linux_init() and destroyed in pstate_linux_deinit().
772 static struct workqueue_struct *pstate_wq;
// Member of pstate_work_t. Code in this file casts between pstate_work_t*
// and struct work_struct* in both directions, which presumably relies on this
// being the first member (confirm against the full typedef).
775 struct work_struct work;
// Register the stub governor with cpufreq.
// NOTE(review): the result of cpufreq_register_governor() is discarded, so a
// failed registration goes unnoticed by callers.
781 static inline void pstate_register_linux_governor(void)
783 cpufreq_register_governor(&stub_governor);
// Remove the stub governor from cpufreq.
787 static inline void pstate_unregister_linux_governor(void)
789 cpufreq_unregister_governor(&stub_governor);
// Set up the Linux-side pieces: register the stub governor and create the
// work queue used for deferred frequency changes. The failure path logs an
// error and unregisters the governor again.
793 static int pstate_linux_init(void)
795 pstate_register_linux_governor();
796 pstate_wq = create_workqueue("v3vee_pstate_wq");
798 ERROR("Could not create work queue\n");
805 pstate_unregister_linux_governor();
// Tear down what pstate_linux_init() created: unregister the governor, let
// queued work finish, then destroy the work queue.
810 static void pstate_linux_deinit(void)
812 pstate_unregister_linux_governor();
813 flush_workqueue(pstate_wq);
814 destroy_workqueue(pstate_wq);
// Look up the cpufreq policy of the given cpu, copy its governor name into a
// freshly allocated MAX_GOV_NAME_LEN-byte buffer and store that buffer in the
// calling core's core_state.linux_governor. The temporary policy copy is
// freed on both the success and the error path.
818 static int get_current_governor(char **buf, unsigned int cpu)
820 struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
821 char * govname = NULL;
824 ERROR("could not allocate cpufreq_policy\n");
828 if (cpufreq_get_policy(policy, cpu) != 0) {
829 ERROR("Could not get current cpufreq policy\n");
833 /* We're in interrupt context, should probably not wait here */
834 govname = palacios_alloc(MAX_GOV_NAME_LEN);
836 ERROR("Could not allocate space for governor name\n");
// NOTE(review): strncpy() leaves govname unterminated if the source name
// fills all MAX_GOV_NAME_LEN bytes; policy->governor is also dereferenced
// without a visible NULL check.
840 strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
842 get_cpu_var(core_state).linux_governor = govname;
843 put_cpu_var(core_state);
847 palacios_free(policy);
852 palacios_free(policy);
857 /* passed to the userspacehelper interface for cleanup */
// Frees the third argv entry and then the argv array itself; both were
// allocated with palacios_alloc() in governor_switch().
858 static void gov_switch_cleanup(struct subprocess_info * s)
860 palacios_free(s->argv[2]);
861 palacios_free(s->argv);
867 * @s - the governor to switch to
868 * TODO: this should probably be submitted to a work queue
869 * so we don't have to run it in interrupt context
// Ask user space to switch the cpufreq governor of the given cpu to s by
// launching /bin/sh with a command that echoes s into the cpu's
// scaling_governor sysfs file. The helper is started with UMH_NO_WAIT, so the
// return value only reflects whether the launch was queued;
// gov_switch_cleanup() frees argv afterwards.
// NOTE(review): s is pasted into a shell command line unescaped. The callers
// in this file pass PALACIOS_GOVNAME or a name saved from the cpufreq policy.
871 static int governor_switch(char * s, unsigned int cpu)
873 char * path_str = NULL;
876 static char * envp[] = {
879 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
882 argv = palacios_alloc(4*sizeof(char*));
884 ERROR("Couldn't allocate argv struct\n");
888 path_str = palacios_alloc(MAX_PATH_LEN);
890 ERROR("Couldn't allocate path string\n");
893 memset(path_str, 0, MAX_PATH_LEN);
895 snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
902 /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
903 return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
// Release the governor-name string saved for the calling core.
// NOTE(review): the pointer in core_state is not cleared afterwards, so a
// second call would free the same buffer again.
911 static inline void free_linux_governor(void)
913 palacios_free(get_cpu_var(core_state).linux_governor);
914 put_cpu_var(core_state);
// Save the governor Linux is currently using on this cpu and then request a
// switch to the Palacios stub governor.
918 static int linux_setup_palacios_governor(void)
921 unsigned int cpu = get_cpu();
924 /* KCH: we assume the v3vee governor is already
925 * registered with kernel by this point
928 if (get_current_governor(&gov, cpu) < 0) {
929 ERROR("Could not get current governor\n");
933 DEBUG("saving current governor (%s)\n", gov);
// get_current_governor() has already stored the same buffer in
// core_state.linux_governor, so this assignment repeats that store.
935 get_cpu_var(core_state).linux_governor = gov;
936 put_cpu_var(core_state);
938 DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
940 /* set the new one to ours */
942 if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
943 ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
// Walk the cpufreq frequency table of the calling cpu looking for the entry
// whose frequency equals the policy's current frequency. Entries marked
// CPUFREQ_ENTRY_INVALID get special handling.
952 static int linux_get_pstate(void)
954 struct cpufreq_policy * policy = NULL;
955 struct cpufreq_frequency_table *table;
957 unsigned int count = 0;
958 unsigned int cpu = get_cpu();
962 policy = palacios_alloc(sizeof(struct cpufreq_policy));
964 ERROR("Could not allocate policy struct\n");
// NOTE(review): the result of cpufreq_get_policy() is not checked here,
// unlike in linux_get_freq() and linux_set_pstate().
968 cpufreq_get_policy(policy, cpu);
969 table = cpufreq_frequency_get_table(cpu);
971 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
973 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
977 if (table[i].frequency == policy->cur) {
984 palacios_free(policy);
// Fetch a copy of the calling cpu's cpufreq policy in order to report its
// frequency; allocation and lookup failures are logged.
// NOTE(review): the function returns int while its caller,
// palacios_pstate_ctrl_get_freq(), returns uint64_t.
991 static int linux_get_freq(void)
993 struct cpufreq_policy * policy = NULL;
994 unsigned int cpu = get_cpu();
997 policy = palacios_alloc(sizeof(struct cpufreq_policy));
999 ERROR("Could not allocate policy struct\n");
1003 if (cpufreq_get_policy(policy, cpu)) {
1004 ERROR("Could not get current policy\n");
// Work-queue handler that carries out a frequency change queued by
// linux_set_pstate() or linux_set_freq(). It runs with v3_governor_mutex
// held, records the requested frequency, clamps it to the core's min/max and
// hands it to __cpufreq_driver_target(). The work item is freed here.
1012 pstate_switch_workfn (struct work_struct *work)
// The work_struct pointer is reinterpreted as the enclosing pstate_work_t.
1014 pstate_work_t * pwork = (pstate_work_t*)work;
1015 struct cpufreq_policy * policy = NULL;
1017 unsigned int cpu = get_cpu();
1020 mutex_lock(&v3_governor_mutex);
1022 policy = palacios_alloc(sizeof(struct cpufreq_policy));
1024 ERROR("Could not allocate space for cpufreq policy\n");
1028 if (cpufreq_get_policy(policy, cpu) != 0) {
1029 ERROR("Could not get cpufreq policy\n");
// NOTE(review): get_cpu_var(core_state) is evaluated up to five times below
// against a single put_cpu_var(); in mainline Linux each get_cpu_var()
// disables preemption, so the pairs look unbalanced. Confirm.
1034 get_cpu_var(core_state).set_freq_khz = freq;
1036 if (freq < get_cpu_var(core_state).min_freq_khz) {
1037 freq = get_cpu_var(core_state).min_freq_khz;
1039 if (freq > get_cpu_var(core_state).max_freq_khz) {
1040 freq = get_cpu_var(core_state).max_freq_khz;
1042 put_cpu_var(core_state);
1044 INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
1045 __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
1048 palacios_free(policy);
1050 palacios_free(work);
1051 mutex_unlock(&v3_governor_mutex);
// Translate a p-state index into a frequency from the cpu's cpufreq table
// and queue a work item on pstate_wq that applies it. Ownership of the work
// item passes to pstate_switch_workfn() once queued; the error path frees it
// here instead.
1055 static int linux_set_pstate(uint8_t p)
1057 struct cpufreq_policy * policy = NULL;
1058 struct cpufreq_frequency_table *table;
1059 pstate_work_t * work = NULL;
1061 unsigned int count = 0;
1064 unsigned int cpu = get_cpu();
1067 policy = palacios_alloc(sizeof(struct cpufreq_policy));
1069 ERROR("Could not allocate policy struct\n");
1073 work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1075 ERROR("Could not allocate work struct\n");
1079 if (cpufreq_get_policy(policy, cpu)) {
1080 ERROR("Could not get current policy\n");
1083 table = cpufreq_frequency_get_table(cpu);
1085 for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1087 if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
// Matching entry: queue the change to this table frequency.
1093 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1094 work->freq = table[i].frequency;
1095 queue_work(pstate_wq, (struct work_struct*)work);
1105 /* we need to deal with the case in which we get a number > max pstate */
// Fallback: use the last valid table entry seen during the walk.
1107 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1108 work->freq = table[last_valid].frequency;
1109 queue_work(pstate_wq, (struct work_struct*)work);
1112 palacios_free(policy);
1116 palacios_free(work);
1118 palacios_free(policy);
// Queue a work item on pstate_wq that asks cpufreq for frequency f. The
// request is compared against the policy's min and max before queueing. The
// error path frees the work item; otherwise pstate_switch_workfn() owns it.
1123 static int linux_set_freq(uint64_t f)
1125 struct cpufreq_policy * policy = NULL;
1126 pstate_work_t * work = NULL;
1128 unsigned int cpu = get_cpu();
1131 policy = palacios_alloc(sizeof(struct cpufreq_policy));
1133 ERROR("Could not allocate policy struct\n");
1137 work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1139 ERROR("Could not allocate work struct\n");
1143 if (cpufreq_get_policy(policy, cpu) != 0) {
1144 ERROR("Could not get cpufreq policy\n");
1148 if (f < policy->min) {
1150 } else if (f > policy->max) {
1156 INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1158 queue_work(pstate_wq, (struct work_struct*)work);
1160 palacios_free(policy);
1164 palacios_free(work);
1166 palacios_free(policy);
// Switch the calling cpu back to the governor saved in
// core_state.linux_governor and free the saved name on both the success and
// the failure path.
1171 static int linux_restore_defaults(void)
1174 unsigned int cpu = get_cpu();
1177 gov = get_cpu_var(core_state).linux_governor;
1178 put_cpu_var(core_state);
1180 DEBUG("restoring previous governor (%s)\n", gov);
// governor_switch() formats gov into its own command buffer, so freeing the
// saved name afterwards does not affect the queued helper.
1182 if (governor_switch(gov, cpu) < 0) {
1183 ERROR("Could not restore governor to (%s)\n", gov);
1187 free_linux_governor();
1191 free_linux_governor();
1197 /******************************************************************
1198 Generic Interface as provided to Palacios and to the rest of the
1200 ******************************************************************/
// Initialise the calling core's core_state: host-control mode, p-state range
// from the architecture callbacks (zero when there are none) and, when a
// cpufreq policy exists for this cpu, its min/max/current frequency.
// NOTE(review): get_cpu_var(core_state) is evaluated many times in this
// function but put_cpu_var(core_state) appears once; in mainline Linux each
// get_cpu_var() disables preemption, so the pairs look unbalanced. Confirm.
1202 static void init_core(void)
1205 struct cpufreq_policy *p;
1209 DEBUG("P-State Core Init\n");
1211 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1212 get_cpu_var(core_state).cur_pstate = 0;
1214 if (machine_state.funcs) {
1215 get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1216 get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1218 get_cpu_var(core_state).min_pstate = 0;
1219 get_cpu_var(core_state).max_pstate = 0;
// The cpu number is sampled and preemption re-enabled immediately.
1223 cpu = get_cpu(); put_cpu();
1225 p = cpufreq_cpu_get(cpu);
1228 get_cpu_var(core_state).have_cpufreq = 0;
1229 get_cpu_var(core_state).min_freq_khz=0;
1230 get_cpu_var(core_state).max_freq_khz=0;
1231 get_cpu_var(core_state).cur_freq_khz=0;
1233 get_cpu_var(core_state).have_cpufreq = 1;
1234 get_cpu_var(core_state).min_freq_khz=p->min;
1235 get_cpu_var(core_state).max_freq_khz=p->max;
// NOTE(review): cpufreq_cpu_put(p) sits after the closing brace on the next
// line, so it runs whichever branch was taken. If the have_cpufreq = 0 branch
// is the one where p is NULL, the put is then called on NULL. Confirm.
1236 get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p);
1237 put_cpu_var(core_state);
// NOTE(review): this loop dereferences get_cpu_var(processors)->performance
// without the NULL checks supports_pstates_intel() applies to the same data.
1239 for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) {
1240 INFO("P-State: %u: freq=%llu ctrl=%llx",
1242 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1243 get_cpu_var(processors)->performance->states[i].control);
1245 put_cpu_var(processors);
// Forward declaration: deinit_core() calls this before its definition.
1249 void palacios_pstate_ctrl_release(void);
// Per-core teardown: hand p-state control of the calling core back to the
// host by way of palacios_pstate_ctrl_release().
1252 static void deinit_core(void)
1254 DEBUG("P-State Core Deinit\n");
1255 palacios_pstate_ctrl_release();
1261 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c)
1263 memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1266 c->features = V3_PSTATE_INTERNAL_CONTROL;
1268 if (get_cpu_var(core_state).have_cpufreq) {
1269 c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1272 if (machine_state.arch==AMD || machine_state.arch==INTEL) {
1273 c->features |= V3_PSTATE_DIRECT_CONTROL;
1275 c->cur_mode = get_cpu_var(core_state).mode;
1276 c->min_pstate = get_cpu_var(core_state).min_pstate;
1277 c->max_pstate = get_cpu_var(core_state).max_pstate;
1278 c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1279 c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1280 c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1281 c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1283 put_cpu_var(core_state);
1290 uint64_t palacios_pstate_ctrl_get_pstate(void)
1292 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1293 put_cpu_var(core_state);
1294 return machine_state.funcs->get_pstate();
1295 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1296 put_cpu_var(core_state);
1297 return linux_get_pstate();
1299 put_cpu_var(core_state);
1305 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1307 if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
1308 put_cpu_var(core_state);
1309 machine_state.funcs->set_pstate(p);
1310 } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1311 put_cpu_var(core_state);
1312 linux_set_pstate(p);
1314 put_cpu_var(core_state);
// Adapter taking the p-state as a void pointer argument.
// The pointer value is narrowed to 8 bits before being passed on, so only
// p-states 0..255 can be requested through this path.
1319 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1321 palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
// Return the frequency of the calling core as reported by linux_get_freq()
// when the core is in V3_PSTATE_EXTERNAL_CONTROL mode; other modes take the
// fallback branch.
1325 uint64_t palacios_pstate_ctrl_get_freq(void)
1327 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1328 put_cpu_var(core_state);
1329 return linux_get_freq();
1331 put_cpu_var(core_state);
// Frequency-setting entry point; it distinguishes only whether the calling
// core is in V3_PSTATE_EXTERNAL_CONTROL mode.
1337 void palacios_pstate_ctrl_set_freq(uint64_t p)
1339 if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1340 put_cpu_var(core_state);
1343 put_cpu_var(core_state);
// Move the calling core from host control to external (cpufreq-mediated)
// control. Requires cpufreq on this core; otherwise an error is logged.
1348 static int switch_to_external(void)
1350 DEBUG("switch from host control to external\n");
1352 if (!(get_cpu_var(core_state).have_cpufreq)) {
1353 put_cpu_var(core_state);
1354 ERROR("No cpufreq - cannot switch to external...\n");
1357 put_cpu_var(core_state);
// NOTE(review): the result of linux_setup_palacios_governor() is ignored, so
// the mode below is set even if the governor switch failed.
1359 linux_setup_palacios_governor();
1361 get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1362 put_cpu_var(core_state);
// Move the calling core from host control to direct (MSR-level) control.
// When cpufreq is present the Palacios stub governor is installed first so
// Linux stops changing the frequency; then, if an architecture callback
// table with arch_init exists, the mode is set and arch_init() is run.
1368 static int switch_to_direct(void)
1370 DEBUG("switch from host control to direct\n");
1372 if (get_cpu_var(core_state).have_cpufreq) {
1373 put_cpu_var(core_state);
1374 DEBUG("switch to direct from cpufreq\n");
1376 // The implementation would set the policy and governor to peg cpu
1377 // regardless of load
1378 linux_setup_palacios_governor();
1380 put_cpu_var(core_state);
1383 if (machine_state.funcs && machine_state.funcs->arch_init) {
1384 get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
// arch_init() runs between the get and put of core_state; the Intel hook
// itself performs a nested get_cpu_var()/put_cpu_var() pair.
1386 machine_state.funcs->arch_init();
1388 put_cpu_var(core_state);
// Move the calling core from host control to internal control. When cpufreq
// is present the Palacios stub governor is installed first; the mode is then
// set to V3_PSTATE_INTERNAL_CONTROL.
1395 static int switch_to_internal(void)
1397 DEBUG("switch from host control to internal\n");
1399 if (get_cpu_var(core_state).have_cpufreq) {
1400 put_cpu_var(core_state);
1401 DEBUG("switch to internal on machine with cpu freq\n");
1402 linux_setup_palacios_governor();
1404 put_cpu_var(core_state);
1407 get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1409 put_cpu_var(core_state);
// Return the calling core from external control to host control: restore the
// saved Linux governor and set the mode to V3_PSTATE_HOST_CONTROL.
1415 static int switch_from_external(void)
1417 if (!(get_cpu_var(core_state).have_cpufreq)) {
1418 put_cpu_var(core_state);
1419 ERROR("No cpufreq - how did we get here... external...\n");
1422 put_cpu_var(core_state);
1424 DEBUG("Switching back to host control from external\n");
// have_cpufreq is tested a second time here although the check at the top
// already handled the case where it is zero.
1426 if (get_cpu_var(core_state).have_cpufreq) {
1427 put_cpu_var(core_state);
1428 linux_restore_defaults();
1430 put_cpu_var(core_state);
1433 get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1434 put_cpu_var(core_state);
// Return the calling core from direct control to host control: program the
// core's min_pstate, run the architecture deinit hook, restore the saved
// Linux governor when cpufreq is present and set the mode to
// V3_PSTATE_HOST_CONTROL.
1440 static int switch_from_direct(void)
1443 DEBUG("Switching back to host control from direct\n");
1445 // Set maximum performance, just in case there is no host control
// NOTE(review): the get_cpu_var() in the argument below has no matching
// put_cpu_var(); this function contains three gets but only two puts on any
// path. Confirm.
1446 machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1447 machine_state.funcs->arch_deinit();
1449 if (get_cpu_var(core_state).have_cpufreq) {
1450 put_cpu_var(core_state);
1451 linux_restore_defaults();
1453 put_cpu_var(core_state);
1456 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1458 put_cpu_var(core_state);
// Return the calling core from internal control to host control: restore the
// saved Linux governor when cpufreq is present and set the mode to
// V3_PSTATE_HOST_CONTROL.
1464 static int switch_from_internal(void)
1466 DEBUG("Switching back to host control from internal\n");
1468 if (get_cpu_var(core_state).have_cpufreq) {
1469 put_cpu_var(core_state);
1470 // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1471 // The implementation would switch back to default policy and governor
1472 linux_restore_defaults();
1474 put_cpu_var(core_state);
1477 get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1479 put_cpu_var(core_state);
// Take p-state control of the calling core in the requested mode. If the
// core is not currently under host control, the existing mode is released
// first. Unknown types are logged as errors.
// NOTE(review): the results of the switch_to_*() helpers are not used here.
1486 void palacios_pstate_ctrl_acquire(uint32_t type)
1488 if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) {
1489 put_cpu_var(core_state);
1490 palacios_pstate_ctrl_release();
1492 put_cpu_var(core_state);
1496 case V3_PSTATE_EXTERNAL_CONTROL:
1497 switch_to_external();
1499 case V3_PSTATE_DIRECT_CONTROL:
1502 case V3_PSTATE_INTERNAL_CONTROL:
1503 switch_to_internal();
1506 ERROR("Unknown pstate control type %u\n",type);
1512 // Wrappers for xcalls
1513 static void palacios_pstate_ctrl_acquire_external(void)
1515 palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1518 static void palacios_pstate_ctrl_acquire_direct(void)
1520 palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1524 void palacios_pstate_ctrl_release(void)
1526 if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) {
1527 put_cpu_var(core_state);
1530 put_cpu_var(core_state);
1532 switch (get_cpu_var(core_state).mode) {
1533 case V3_PSTATE_EXTERNAL_CONTROL:
1534 put_cpu_var(core_state);
1535 switch_from_external();
1537 case V3_PSTATE_DIRECT_CONTROL:
1538 put_cpu_var(core_state);
1539 switch_from_direct();
1541 case V3_PSTATE_INTERNAL_CONTROL:
1542 put_cpu_var(core_state);
1543 switch_from_internal();
1546 put_cpu_var(core_state);
1547 ERROR("Unknown pstate control type %u\n",core_state.mode);
1553 static void update_hw_pstate(void *arg)
1555 if (machine_state.funcs && machine_state.funcs->get_pstate) {
1556 get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1557 put_cpu_var(core_state);
1559 get_cpu_var(core_state).cur_hw_pstate = 0;
1560 put_cpu_var(core_state);
1565 /***************************************************************************
1566 PROC Interface to expose state
1567 ***************************************************************************/
1569 static int pstate_show(struct seq_file * file, void * v)
1572 unsigned int numcpus = num_online_cpus();
1574 seq_printf(file, "V3VEE DVFS Status\n\n");
1576 for (cpu=0;cpu<numcpus;cpu++) {
1577 palacios_xcall(cpu,update_hw_pstate,0);
1580 seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1581 machine_state.arch==INTEL ? "Intel" :
1582 machine_state.arch==AMD ? "AMD" : "Other",
1583 machine_state.supports_pstates ? "Yes" : "No");
1585 for (cpu=0;cpu<numcpus;cpu++) {
1586 struct pstate_core_info *s = &per_cpu(core_state,cpu);
1587 seq_printf(file,"pcore %u: hw pstate 0x%x mode %s of [ host ",cpu,
1589 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1590 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1591 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
1592 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1593 if (s->have_cpufreq) {
1594 seq_printf(file,"external ");
1596 if (machine_state.supports_pstates) {
1597 seq_printf(file,"direct ");
1599 seq_printf(file,"internal ] ");
1600 if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
1601 seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1603 if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
1604 seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1606 seq_printf(file,"\n");
1611 static int pstate_open(struct inode * inode, struct file * file)
1613 return single_open(file, pstate_show, NULL);
1617 static struct file_operations pstate_fops = {
1618 .owner = THIS_MODULE,
1619 .open = pstate_open,
1621 .llseek = seq_lseek,
1622 .release = seq_release
1625 int pstate_proc_setup(void)
1627 struct proc_dir_entry *proc;
1629 proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1632 ERROR("Failed to create proc entry for p-state control\n");
1636 proc->proc_fops = &pstate_fops;
/*
 * Remove the "v3-dvfs" entry from the Palacios proc directory.
 */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *parent = palacios_get_procdir();

    remove_proc_entry("v3-dvfs",parent);
}
1646 /********************************************************************
1647 User interface (ioctls)
1648 ********************************************************************/
1650 static int dvfs_ctrl(unsigned int cmd, unsigned long arg)
1652 struct v3_dvfs_ctrl_request r;
1654 if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1655 ERROR("Failed to copy DVFS request from user\n");
1659 if (r.pcore >= num_online_cpus()) {
1660 ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1665 case V3_DVFS_ACQUIRE: {
1666 switch (r.acq_type) {
1667 case V3_DVFS_EXTERNAL:
1668 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1671 case V3_DVFS_DIRECT:
1672 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1676 ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1681 case V3_DVFS_RELEASE: {
1682 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1686 case V3_DVFS_SETFREQ: {
1687 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1691 case V3_DVFS_SETPSTATE: {
1692 palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1696 ERROR("Unknown DVFS command %u\n",r.cmd);
1704 void pstate_user_setup(void)
1706 add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1710 void pstate_user_teardown(void)
1712 remove_global_ctrl(V3_DVFS_CTRL);
1715 static struct v3_host_pstate_ctrl_iface hooks = {
1716 .get_chars = palacios_pstate_ctrl_get_chars,
1717 .acquire = palacios_pstate_ctrl_acquire,
1718 .release = palacios_pstate_ctrl_release,
1719 .set_pstate = palacios_pstate_ctrl_set_pstate,
1720 .get_pstate = palacios_pstate_ctrl_get_pstate,
1721 .set_freq = palacios_pstate_ctrl_set_freq,
1722 .get_freq = palacios_pstate_ctrl_get_freq,
1727 static int pstate_ctrl_init(void)
1730 unsigned int numcpus = num_online_cpus();
1732 pstate_arch_setup();
1734 for (cpu=0;cpu<numcpus;cpu++) {
1735 palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1738 V3_Init_Pstate_Ctrl(&hooks);
1740 if (pstate_proc_setup()) {
1741 ERROR("Unable to initialize P-State Control\n");
1745 pstate_user_setup();
1747 pstate_linux_init();
1749 INFO("P-State Control Initialized\n");
1754 static int pstate_ctrl_deinit(void)
1757 unsigned int numcpus=num_online_cpus();
1759 pstate_linux_deinit();
1761 pstate_user_teardown();
1763 pstate_proc_teardown();
1765 // release pstate control if we have it, and we need to do this on each processor
1766 for (cpu=0;cpu<numcpus;cpu++) {
1767 palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1771 // Free any mapping table we built for Intel
1772 if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) {
1773 palacios_free(intel_pstate_to_ctrl);
1781 static struct linux_ext pstate_ext = {
1782 .name = "PSTATE_CTRL",
1783 .init = pstate_ctrl_init,
1784 .deinit = pstate_ctrl_deinit,
1786 .guest_deinit = NULL,
1790 register_extension(&pstate_ext);