X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=blobdiff_plain;f=linux_module%2Fiface-pstate-ctrl.c;h=2c49be919c143b82ff3973f3f505c2841a6d9395;hp=3831ed0d477d33a36aede88377ffac5d74969885;hb=c8b23e99efde3aa5a2c26d1b8e9bc7dc914e6113;hpb=fd288e4dc51177f037f4752861eb95971fb1d1a0 diff --git a/linux_module/iface-pstate-ctrl.c b/linux_module/iface-pstate-ctrl.c index 3831ed0..2c49be9 100644 --- a/linux_module/iface-pstate-ctrl.c +++ b/linux_module/iface-pstate-ctrl.c @@ -22,10 +22,19 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include +// Used to determine the appropriate pstates values on Intel +#include +#include + #include #include "palacios.h" @@ -34,20 +43,40 @@ #include "linux-exts.h" /* - This P-STATE control implementation includes: - - - Direct control of Intel and AMD processor pstates - - External control of processor states via Linux (unimplemented) - - Internal control of processor states in Palacios (handoff from Linux) - - Additionally, it provides a user-space interface for manipulating - p-state regardless of the host's functionality. This includes - an ioctl for commanding the implementation and a /proc file for - showing current status and capabilities. + This P-STATE control implementation includes the following modes. + You can switch between modes at any time. + + - Internal control of processor states in Palacios (handoff from Linux) + When Palacios acuires this control, this module disables Linux cpufreq control + and allows code within Palacios unfettered access to the DVFS hardware. + - Direct control of Intel and AMD processor pstates using code in this module + When you acquire this control, this module disables Linux cpufreq control + and directly programs the processor itself in response to your requests + - External control of processor states via Linux + When you acuire this control, this module uses the Linux cpufreq control + to program the processor on your behelf + - Host control of processor stastes + This is the normal mode of DVFS control (e.g., Linux cpufreq) + + Additionally, it provides a user-space interface for manipulating + p-state regardless of the host's functionality. This includes + an ioctl for commanding the implementation and a /proc file for + showing current status and capabilities. From user space, you can + use the Direct, External, and Host modes. + + What we mean by "p-state" here is the processor's internal + configuration. For AMD, this is defined as being the same as + the ACPI-defined p-state. For Intel, it is not. There, it is the + contents of the perf ctl MSR, which is opaque. We try hard to + provide "p-states" that go from 0...max, by analogy or equivalence + to the ACPI p-states. 
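
   As a rough illustration of the user-space side, a tool that wants
   direct control of core 0 and then the fastest p-state would issue
   something like the following (a sketch only: the ioctl request
   number and the open()ed device node come from the palacios user
   headers and launcher, not from this file, so v3_fd and V3_DVFS_CTRL
   are assumed names here):

       struct v3_dvfs_ctrl_request r;

       memset(&r, 0, sizeof(r));
       r.pcore    = 0;                  // physical core to affect
       r.cmd      = V3_DVFS_ACQUIRE;    // take DVFS away from Linux
       r.acq_type = V3_DVFS_DIRECT;     // this module programs the MSRs itself
       ioctl(v3_fd, V3_DVFS_CTRL, &r);

       memset(&r, 0, sizeof(r));
       r.pcore  = 0;
       r.cmd    = V3_DVFS_SETPSTATE;
       r.pstate = 0;                    // p-state 0 is the fastest in this numbering
       ioctl(v3_fd, V3_DVFS_CTRL, &r);

   A matching V3_DVFS_RELEASE request hands the core back to whatever
   host governor was saved when control was acquired.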
*/ +#define PALACIOS_GOVNAME "v3vee" +#define MAX_PATH_LEN 128 +#define MAX_GOV_NAME_LEN 16 struct pstate_core_info { @@ -58,54 +87,82 @@ struct pstate_core_info { // V3_PSTATE_DIRECT_CONTROL // V3_PSTATE_INTERNAL_CONTROL uint32_t mode; - + // Apply if we are under the DIRECT state - uint8_t cur_pstate; - uint8_t max_pstate; - uint8_t min_pstate; + uint64_t cur_pstate; + uint64_t max_pstate; + uint64_t min_pstate; - uint8_t cur_hw_pstate; + uint64_t cur_hw_pstate; // Apply if we are under the EXTERNAL state + uint64_t set_freq_khz; // this is the frequency we're hoping to get uint64_t cur_freq_khz; uint64_t max_freq_khz; uint64_t min_freq_khz; - - // Intel-specific for DIRECT state + + // Intel-specific + uint8_t prior_speedstep; uint8_t turbo_disabled; uint8_t no_turbo; - + int have_cpufreq; - + + // This is where we stash Linux's governor when we make a mode switch + char * linux_governor; + // We have this so we can restore the original frequency when we started + uint64_t original_hz; + }; static DEFINE_PER_CPU(struct pstate_core_info, core_state); + // These are used to assert DIRECT control over the core pstates struct pstate_core_funcs { void (*arch_init)(void); void (*arch_deinit)(void); - uint8_t (*get_min_pstate)(void); - uint8_t (*get_max_pstate)(void); - uint8_t (*get_pstate)(void); - void (*set_pstate)(uint8_t pstate); + uint64_t (*get_min_pstate)(void); + uint64_t (*get_max_pstate)(void); + uint64_t (*get_pstate)(void); + void (*set_pstate)(uint64_t pstate); }; struct pstate_machine_info { enum {INTEL, AMD, OTHER } arch; int supports_pstates; + + + // For AMD + int have_pstate; + int have_coreboost; + int have_feedback; + + // For Intel + int have_speedstep; + int have_opportunistic; // this means "Turbo Boost" or "IDA" + int have_policy_hint; + int have_hwp; // hardware-controlled performance states + int have_hdc; // hardware duty cycling + int have_mwait_ext; // mwait power extensions + int have_mwait_int; // mwait wakes on interrupt + + // for both + int have_pstate_hw_coord; // mperf/aperf + // used for DIRECT control struct pstate_core_funcs *funcs; + }; static struct pstate_machine_info machine_state; /**************************************************** - AMD DIRECT CONTROL -***************************************************/ + AMD DIRECT CONTROL + ***************************************************/ /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */ #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061 @@ -149,22 +206,59 @@ struct p_state_ctl_reg_amd { /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */ static uint8_t supports_pstates_amd (void) { + int i; + int mapwrong=0; + int amd_num_pstates; + uint32_t eax, ebx, ecx, edx; + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - return !!(edx & (1 << 7)); + machine_state.have_pstate = !!(edx & (1 << 7)); + machine_state.have_coreboost = !!(edx & (1<<9)); + machine_state.have_feedback = !!(edx & (1<<11)); + + cpuid(0x6, &eax, &ebx, &ecx, &edx); + machine_state.have_pstate_hw_coord = !!(ecx & 1); + + INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n", + machine_state.have_pstate, + machine_state.have_coreboost, + machine_state.have_feedback, + machine_state.have_pstate_hw_coord); + + amd_num_pstates = get_cpu_var(processors)->performance->state_count; + if (amd_num_pstates) { + for (i=0;iperformance->states[i].core_frequency*1000, + get_cpu_var(processors)->performance->states[i].control, + get_cpu_var(processors)->performance->states[i].control != i ? 
(mapwrong=1, " ALERT - CTRL MAPPING NOT 1:1") : ""); + } + } + if (mapwrong) { + ERROR("P-State: AMD: mapping of pstate and control is not 1:1 on this processor - we will probably not work corrrectly\n"); + } + + return machine_state.have_pstate; + + } + static void init_arch_amd(void) { /* KCH: nothing to do here */ } + static void deinit_arch_amd(void) { /* KCH: nothing to do here */ } -static uint8_t get_pstate_amd(void) + +static uint64_t get_pstate_amd(void) { struct p_state_stat_reg_amd pstat; @@ -176,9 +270,16 @@ static uint8_t get_pstate_amd(void) return pstat.reg.pstate; } -static void set_pstate_amd(uint8_t p) + +static void set_pstate_amd(uint64_t p) { struct p_state_ctl_reg_amd pctl; + + if (p>get_cpu_var(core_state).max_pstate) { + p=get_cpu_var(core_state).max_pstate; + } + put_cpu_var(core_state); + pctl.val = 0; pctl.reg.cmd = p; @@ -188,10 +289,11 @@ static void set_pstate_amd(uint8_t p) put_cpu_var(core_state); } + /* * NOTE: HW may change this value at runtime */ -static uint8_t get_max_pstate_amd(void) +static uint64_t get_max_pstate_amd(void) { struct p_state_limit_reg_amd plimits; @@ -201,7 +303,7 @@ static uint8_t get_max_pstate_amd(void) } -static uint8_t get_min_pstate_amd(void) +static uint64_t get_min_pstate_amd(void) { struct p_state_limit_reg_amd plimits; @@ -225,8 +327,14 @@ static struct pstate_core_funcs amd_funcs = /*********************************************************** INTEL DIRECT CONTROL -**********************************************************/ + **********************************************************/ + +/* + This implementation uses SpeedStep, but does check + to see if the other features (MPERF/APERF, Turbo/IDA, HWP) + are available. +*/ /* Intel System Programmer's Manual Vol. 3B, 14-2 */ #define MSR_MPERF_IA32 0x000000e7 @@ -235,34 +343,174 @@ static struct pstate_core_funcs amd_funcs = #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad #define MSR_PLATFORM_INFO_IA32 0x000000ce #define MSR_PERF_CTL_IA32 0x00000199 +#define MSR_PERF_STAT_IA32 0x00000198 +#define MSR_ENERY_PERF_BIAS_IA32 0x000001b0 + + +/* Note that the actual meaning of the pstate + in the control and status registers is actually + implementation dependent, unlike AMD. The "official" + way to figure it out the mapping from pstate to + these values is via ACPI. What is written in the register + is an "id" of an operation point + + "Often", the 16 bit field consists of a high order byte + which is the frequency (the multiplier) and the low order + byte is the voltage. 
+ */ +// MSR_PERF_CTL_IA32 r/w +struct perf_ctl_reg_intel { + union { + uint64_t val; + struct { + // This is the target + // Note, not the ACPI pstate, but + // Intel's notion of pstate is that it's opaque + // for lots of implementations it seems to be + // frequency_id : voltage_id + // where frequency_id is typically the multiplier + uint16_t pstate : 16; + uint16_t reserved : 16; + // set to 1 to *disengage* dynamic acceleration + // Note that "IDA" and "Turbo" use the same interface + uint16_t dynamic_accel_disable : 1; + uint32_t reserved2 : 31; + } reg; + } __attribute__((packed)); +} __attribute__((packed)); +// MSR_PERF_STAT_IA32 r +struct perf_stat_reg_intel { + union { + uint64_t val; + struct { + // this is the current + uint16_t pstate : 16; + uint64_t reserved : 48; + } reg; + } __attribute__((packed)); +} __attribute__((packed)); +// MSR_ENERGY_PERF_BIAS_IA32 r/w +struct enery_perf_bias_reg_intel { + union { + uint64_t val; + struct { + // this is the current + uint8_t policy_hint : 4; + uint64_t reserved : 60; + } reg; + } __attribute__((packed)); +} __attribute__((packed)); +// MSR_PLATFORM_INFO struct turbo_mode_info_reg_intel { union { uint64_t val; struct { - uint8_t rsvd0; - uint8_t max_noturbo_ratio; - uint16_t rsvd1 : 12; - uint8_t ratio_limit : 1; + uint8_t rsvd0 : 8; + uint8_t max_noturbo_ratio : 8; + uint8_t rsvd1 : 7; + uint8_t ppin_cap : 1; + uint8_t rsvd2 : 4; + uint8_t ratio_limit : 1; uint8_t tdc_tdp_limit : 1; - uint16_t rsvd2 : 10; - uint8_t min_ratio; - uint16_t rsvd3; + uint16_t rsvd3 : 10; + uint8_t min_ratio : 8; + uint16_t rsvd4 : 16; } reg; } __attribute__((packed)); } __attribute__((packed)); - + +// This replicates the critical information in Linux's struct acpi_processor_px +// To make it easier to port to other OSes. +struct intel_pstate_info { + uint64_t freq; // KHz + uint64_t ctrl; // What to write into the _CTL MSR to get this +}; + +// The internal array will be used if we cannot build the table locally +static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0; +static int intel_num_pstates_internal=0; + +// These will either point to the internal array or to a constructed array +static struct intel_pstate_info *intel_pstate_to_ctrl=0; +static int intel_num_pstates=0; + /* CPUID.01:ECX.AES(7) */ static uint8_t supports_pstates_intel(void) { /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H). - */ + */ uint32_t eax, ebx, ecx, edx; + cpuid(0x1, &eax, &ebx, &ecx, &edx); - return !!(ecx & (1 << 7)); + machine_state.have_speedstep = !!(ecx & (1 << 7)); + + cpuid(0x6, &eax, &ebx, &ecx, &edx); + machine_state.have_pstate_hw_coord = !!(ecx & 1); // ? + machine_state.have_opportunistic = !!(eax & 1<<1); + machine_state.have_policy_hint = !!(ecx & 1<<3); + machine_state.have_hwp = !!(eax & 1<<7); + machine_state.have_hdc = !!(eax & 1<<13); + + cpuid(0x5, &eax, &ebx, &ecx, &edx); + machine_state.have_mwait_ext = !!(ecx & 1); + machine_state.have_mwait_int = !!(ecx & 1<<1); + + + // Note we test all the available hardware features documented as of August 2014 + // We are only currently using speed_step, however. 
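    /*
     * For reference, the same feature bits can be probed from user space
     * with a few CPUID calls.  A minimal sketch (relies only on GCC/clang's
     * <cpuid.h>; it is not part of this module) mirroring the checks above:
     *
     *     #include <stdio.h>
     *     #include <cpuid.h>
     *
     *     int main(void)
     *     {
     *         unsigned a, b, c, d;
     *
     *         __get_cpuid(0x1, &a, &b, &c, &d);
     *         printf("speedstep      : %d\n", !!(c & (1 << 7)));
     *
     *         __get_cpuid(0x6, &a, &b, &c, &d);
     *         printf("aperf/mperf    : %d\n", !!(c & (1 << 0)));
     *         printf("turbo/ida      : %d\n", !!(a & (1 << 1)));
     *         printf("policy hint    : %d\n", !!(c & (1 << 3)));
     *         printf("hwp            : %d\n", !!(a & (1 << 7)));
     *         printf("hdc            : %d\n", !!(a & (1 << 13)));
     *
     *         __get_cpuid(0x5, &a, &b, &c, &d);
     *         printf("mwait ext/intr : %d/%d\n", !!(c & 1), !!(c & (1 << 1)));
     *
     *         return 0;
     *     }
     */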
+ + INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n", + machine_state.have_speedstep, + machine_state.have_pstate_hw_coord, + machine_state.have_opportunistic, + machine_state.have_policy_hint, + machine_state.have_hwp, + machine_state.have_hdc, + machine_state.have_mwait_ext, + machine_state.have_mwait_int ); + + + if (machine_state.have_speedstep) { + uint32_t i; + // Build mapping table (from "pstate" (0..) to ctrl value for MSR + if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { + put_cpu_var(processors); + // no acpi... revert to internal table + intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal; + intel_num_pstates=intel_num_pstates_internal; + } else { + intel_num_pstates = get_cpu_var(processors)->performance->state_count; + if (intel_num_pstates) { + intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates); + if (!intel_pstate_to_ctrl) { + ERROR("P-State: Cannot allocate space for mapping...\n"); + intel_num_pstates=0; + } + for (i=0;iperformance->states[i].core_frequency*1000; + intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control; + } + + } else { + ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n"); + } + } + put_cpu_var(processors); + INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates); + for (i=0;i> 16) & 0x1; + put_cpu_var(core_state); + + // enable speedstep (probably already on) + val |= 1 << 16; wrmsrl(MSR_MISC_ENABLE_IA32, val); + //INFO("P-State: write ENABLE=%llx\n",val); + } static void deinit_arch_intel(void) { - // ?? + uint64_t val; + + rdmsrl(MSR_MISC_ENABLE_IA32, val); + + //INFO("P-State: deinit: ENABLE=%llx\n",val); + + val &= ~(1ULL << 16); + val |= get_cpu_var(core_state).prior_speedstep << 16; + put_cpu_var(core_state); + + wrmsrl(MSR_MISC_ENABLE_IA32, val); + + //INFO("P-state: deinit ENABLE=%llx\n",val); + } /* TODO: Intel P-states require sampling at intervals... */ -static uint8_t get_pstate_intel(void) +static uint64_t get_pstate_intel(void) { - uint8_t pstate; + uint64_t val; - // This should read the HW... 
- pstate=get_cpu_var(core_state).cur_pstate; - put_cpu_var(core_state); - return pstate; + rdmsrl(MSR_PERF_STAT_IA32,val); + + //INFO("P-State: Get: 0x%llx\n", val); + + // should check if turbo is active, in which case + // this value is not the whole story + + return val; } - -static void set_pstate_intel(uint8_t p) + +static void set_pstate_intel(uint64_t p) { - uint64_t val = ((uint64_t)p) << 8; + uint64_t val; + uint64_t ctrl; - /* ...Intel IDA (dynamic acceleration) - if (c->no_turbo && !c->turbo_disabled) { - val |= 1 << 32; + if (intel_num_pstates==0) { + return ; + } else { + if (p>=intel_num_pstates) { + p=intel_num_pstates-1; + } } - */ + + ctrl=intel_pstate_to_ctrl[p].ctrl; + + /* ...Intel IDA (dynamic acceleration) + if (c->no_turbo && !c->turbo_disabled) { + val |= 1 << 32; + } + */ + // leave all bits along expect for the likely + // fid bits + + rdmsrl(MSR_PERF_CTL_IA32, val); + //INFO("P-State: Pre-Set: 0x%llx\n", val); + + val &= ~0xffffULL; + val |= ctrl & 0xffffULL; + + //INFO("P-State: Set: 0x%llx\n", val); wrmsrl(MSR_PERF_CTL_IA32, val); @@ -311,24 +605,20 @@ static void set_pstate_intel(uint8_t p) } -static uint8_t get_min_pstate_intel(void) +static uint64_t get_min_pstate_intel(void) { - struct turbo_mode_info_reg_intel t; - - rdmsrl(MSR_PLATFORM_INFO_IA32, t.val); - - return t.reg.min_ratio; + return 0; } -static uint8_t get_max_pstate_intel (void) +static uint64_t get_max_pstate_intel (void) { - struct turbo_mode_info_reg_intel t; - - rdmsrl(MSR_PLATFORM_INFO_IA32, t.val); - - return t.reg.max_noturbo_ratio; + if (intel_num_pstates==0) { + return 0; + } else { + return intel_num_pstates-1; + } } static struct pstate_core_funcs intel_funcs = @@ -345,70 +635,70 @@ static struct pstate_core_funcs intel_funcs = /*********************************************** Arch determination and setup -***********************************************/ - + ***********************************************/ + static inline void cpuid_string (uint32_t id, uint32_t dest[4]) { asm volatile("cpuid" - :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3)) - :"a"(id)); + :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3)) + :"a"(id)); } - + static int get_cpu_vendor (char name[13]) { uint32_t dest[4]; uint32_t maxid; - + cpuid_string(0,dest); maxid=dest[0]; ((uint32_t*)name)[0]=dest[1]; ((uint32_t*)name)[1]=dest[3]; ((uint32_t*)name)[2]=dest[2]; name[12]=0; - + return maxid; } static int is_intel (void) { - char name[13]; - get_cpu_vendor(name); - return !strcmp(name,"GenuineIntel"); + char name[13]; + get_cpu_vendor(name); + return !strcmp(name,"GenuineIntel"); } static int is_amd (void) { - char name[13]; - get_cpu_vendor(name); - return !strcmp(name,"AuthenticAMD"); + char name[13]; + get_cpu_vendor(name); + return !strcmp(name,"AuthenticAMD"); } static int pstate_arch_setup(void) { - + if (is_amd()) { machine_state.arch = AMD; machine_state.funcs = &amd_funcs; - machine_state.supports_pstates = supports_pstates_amd(); - INFO("PSTATE: P-State initialized for AMD\n"); + machine_state.supports_pstates = supports_pstates_amd(); + INFO("PSTATE: P-State initialized for AMD\n"); } else if (is_intel()) { machine_state.arch = INTEL; machine_state.funcs = &intel_funcs; - machine_state.supports_pstates = supports_pstates_intel(); + machine_state.supports_pstates = supports_pstates_intel(); INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n"); return 0; - + } else { - machine_state.arch = OTHER; - machine_state.funcs = NULL; - machine_state.supports_pstates = 0; + 
machine_state.arch = OTHER; + machine_state.funcs = NULL; + machine_state.supports_pstates = 0; INFO("PSTATE: P-state control: No support for direct control on this architecture\n"); return 0; } - + return 0; } @@ -416,64 +706,556 @@ static int pstate_arch_setup(void) /****************************************************************** Linux Interface -*****************************************************************/ + *****************************************************************/ + +static unsigned cpus_using_v3_governor; +static DEFINE_MUTEX(v3_governor_mutex); -#if 0 -// The purpose of the stub governor is the pretend to keep -// the processor at the maximum frequency, while we manipulate he -// processor ccre directly +/* KCH: this will tell us when there is an actual frequency transition */ +static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + + if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) { + return 0; + } + + if (val == CPUFREQ_POSTCHANGE) { + DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n", + freq->cpu, freq->new); + per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new; + } + + return 0; + +} + + +static struct notifier_block v3_cpufreq_notifier_block = { + .notifier_call = v3_cpufreq_notifier +}; + + +/* + * This stub governor is simply a placeholder for preventing + * frequency changes from the Linux side. For now, we simply leave + * the frequency as is when we acquire control. + */ static int governor_run(struct cpufreq_policy *policy, unsigned int event) { - switch (event) { - case CPUFREQ_GOV_START: - case CPUFREQ_GOV_STOP: - cpu_freq_driver_target(policy, policy->max_freq); + unsigned cpu = policy->cpu; - case CPUFREQ_GOV_LIMITS: + switch (event) { + /* we can't use cpufreq_driver_target here as it can result + * in a circular dependency, so we'll keep the current frequency as is + */ + case CPUFREQ_GOV_START: + BUG_ON(!policy->cur); + + mutex_lock(&v3_governor_mutex); + + if (cpus_using_v3_governor == 0) { + cpufreq_register_notifier(&v3_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + + cpus_using_v3_governor++; + + per_cpu(core_state, cpu).set_freq_khz = policy->cur; + per_cpu(core_state, cpu).cur_freq_khz = policy->cur; + per_cpu(core_state, cpu).max_freq_khz = policy->max; + per_cpu(core_state, cpu).min_freq_khz = policy->min; + + mutex_unlock(&v3_governor_mutex); + break; + case CPUFREQ_GOV_STOP: + mutex_lock(&v3_governor_mutex); + + cpus_using_v3_governor--; + + if (cpus_using_v3_governor == 0) { + cpufreq_unregister_notifier( + &v3_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + + per_cpu(core_state, cpu).set_freq_khz = 0; + per_cpu(core_state, cpu).cur_freq_khz = 0; + per_cpu(core_state, cpu).max_freq_khz = 0; + per_cpu(core_state, cpu).min_freq_khz = 0; + + mutex_unlock(&v3_governor_mutex); + break; + case CPUFREQ_GOV_LIMITS: + /* do nothing */ + break; + default: + ERROR("Undefined governor command (%u)\n", event); + return -1; } + + return 0; } + static struct cpufreq_governor stub_governor = { - .name="PALACIOS_STUB", - .governor=governor_run, - .owner=.THIS_MODULE, + .name = PALACIOS_GOVNAME, + .governor = governor_run, + .owner = THIS_MODULE, +}; + + +static struct workqueue_struct *pstate_wq; + +typedef struct { + struct work_struct work; + uint64_t freq; +} pstate_work_t; + + + +static inline void pstate_register_linux_governor(void) +{ + cpufreq_register_governor(&stub_governor); +} + + +static 
inline void pstate_unregister_linux_governor(void) +{ + cpufreq_unregister_governor(&stub_governor); } -static void linux_init(void) + +static int pstate_linux_init(void) { - // get_policy - // - // change to userspace governor - or change to our do nothing governor? (call set_speed) - // stash the old governor - // tell governor to do max freq + pstate_register_linux_governor(); + pstate_wq = create_workqueue("v3vee_pstate_wq"); + if (!pstate_wq) { + ERROR("Could not create work queue\n"); + goto out_err; + } + + return 0; +out_err: + pstate_unregister_linux_governor(); + return -1; } -static void linux_deinit(void) + +static void pstate_linux_deinit(void) { + pstate_unregister_linux_governor(); + flush_workqueue(pstate_wq); + destroy_workqueue(pstate_wq); } -static uint8_t linux_get_pstate(void) + +static int get_current_governor(char **buf, unsigned int cpu) { + struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy)); + char * govname = NULL; + + if (!policy) { + ERROR("could not allocate cpufreq_policy\n"); + return -1; + } + + if (cpufreq_get_policy(policy, cpu) != 0) { + ERROR("Could not get current cpufreq policy\n"); + goto out_err; + } + + /* We're in interrupt context, should probably not wait here */ + govname = palacios_alloc(MAX_GOV_NAME_LEN); + if (!govname) { + ERROR("Could not allocate space for governor name\n"); + goto out_err; + } + + strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN); + govname[MAX_GOV_NAME_LEN-1] = 0; + + get_cpu_var(core_state).linux_governor = govname; + put_cpu_var(core_state); + + *buf = govname; + + palacios_free(policy); + return 0; + +out_err: + palacios_free(policy); + return -1; } -static void linux_set_pstate(uint8_t p) + +/* passed to the userspacehelper interface for cleanup */ +static void gov_switch_cleanup(struct subprocess_info * s) { + palacios_free(s->argv[2]); + palacios_free(s->argv); } -static void linux_restore_defaults(void) + +/* + * Switch governors + * @s - the governor to switch to + * TODO: this should probably be submitted to a work queue + * so we don't have to run it in interrupt context + */ +static int governor_switch(char * s, unsigned int cpu) { -} + char * path_str = NULL; + char ** argv = NULL; + + static char * envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }; + + + argv = palacios_alloc(4*sizeof(char*)); + if (!argv) { + ERROR("Couldn't allocate argv struct\n"); + return -1; + } + path_str = palacios_alloc(MAX_PATH_LEN); + if (!path_str) { + ERROR("Couldn't allocate path string\n"); + goto out_freeargv; + } + memset(path_str, 0, MAX_PATH_LEN); + + snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu); + + argv[0] = "/bin/sh"; + argv[1] = "-c"; + argv[2] = path_str; + argv[3] = NULL; + + /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */ + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(3,9,0) + return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL); +#else + { + struct subprocess_info *sp; + + sp = call_usermodehelper_setup("/bin/sh", argv, envp, GFP_ATOMIC, NULL, gov_switch_cleanup, NULL); + if (!sp) { + goto out_freeargv; + } + + return call_usermodehelper_exec(sp,0); + } #endif + +out_freeargv: + palacios_free(argv); + return -1; +} + + +static inline void free_linux_governor(void) +{ + palacios_free(get_cpu_var(core_state).linux_governor); + put_cpu_var(core_state); +} + + +static int 
linux_setup_palacios_governor(void) +{ + char * gov; + unsigned int cpu = get_cpu(); + put_cpu(); + + /* KCH: we assume the v3vee governor is already + * registered with kernel by this point + */ + + if (get_current_governor(&gov, cpu) < 0) { + ERROR("Could not get current governor\n"); + return -1; + } + + DEBUG("saving current governor (%s)\n", gov); + + get_cpu_var(core_state).linux_governor = gov; + put_cpu_var(core_state); + + DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME); + + /* set the new one to ours */ + + if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) { + ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME); + return -1; + } + + return 0; +} + + + +static uint64_t linux_get_pstate(void) +{ + struct cpufreq_policy * policy = NULL; + struct cpufreq_frequency_table *table; + unsigned int i = 0; + unsigned int count = 0; + unsigned int cpu = get_cpu(); + put_cpu(); + + + policy = palacios_alloc(sizeof(struct cpufreq_policy)); + if (!policy) { + ERROR("Could not allocate policy struct\n"); + return -1; + } + + cpufreq_get_policy(policy, cpu); + table = cpufreq_frequency_get_table(cpu); + + for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { + + if (table[i].frequency == CPUFREQ_ENTRY_INVALID) { + continue; + } + + if (table[i].frequency == policy->cur) { + break; + } + + count++; + } + + palacios_free(policy); + + put_cpu(); + return count; +} + + +static uint64_t linux_get_freq(void) +{ + uint64_t freq; + struct cpufreq_policy * policy = NULL; + unsigned int cpu = get_cpu(); + put_cpu(); + + policy = palacios_alloc(sizeof(struct cpufreq_policy)); + if (!policy) { + ERROR("Could not allocate policy struct\n"); + return -1; + } + + if (cpufreq_get_policy(policy, cpu)) { + ERROR("Could not get current policy\n"); + return -1; + } + + freq=policy->cur; + + palacios_free(policy); + + return freq; +} + +static void +pstate_switch_workfn (struct work_struct *work) +{ + pstate_work_t * pwork = (pstate_work_t*)work; + struct cpufreq_policy * policy = NULL; + uint64_t freq; + unsigned int cpu = get_cpu(); + put_cpu(); + + mutex_lock(&v3_governor_mutex); + + policy = palacios_alloc(sizeof(struct cpufreq_policy)); + if (!policy) { + ERROR("Could not allocate space for cpufreq policy\n"); + goto out; + } + + if (cpufreq_get_policy(policy, cpu) != 0) { + ERROR("Could not get cpufreq policy\n"); + goto out1; + } + + freq = pwork->freq; + get_cpu_var(core_state).set_freq_khz = freq; + + if (freq < get_cpu_var(core_state).min_freq_khz) { + freq = get_cpu_var(core_state).min_freq_khz; + } + if (freq > get_cpu_var(core_state).max_freq_khz) { + freq = get_cpu_var(core_state).max_freq_khz; + } + put_cpu_var(core_state); + + INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq); + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + +out1: + palacios_free(policy); +out: + palacios_free(work); + mutex_unlock(&v3_governor_mutex); +} + + +static int linux_set_pstate(uint64_t p) +{ + struct cpufreq_policy * policy = NULL; + struct cpufreq_frequency_table *table; + pstate_work_t * work = NULL; + unsigned int i = 0; + unsigned int count = 0; + int state_set = 0; + int last_valid = 0; + unsigned int cpu = get_cpu(); + put_cpu(); + + policy = palacios_alloc(sizeof(struct cpufreq_policy)); + if (!policy) { + ERROR("Could not allocate policy struct\n"); + return -1; + } + + work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t)); + if (!work) { + ERROR("Could not allocate work struct\n"); + goto out_err; + } + + if (cpufreq_get_policy(policy, cpu)) 
{ + ERROR("Could not get current policy\n"); + goto out_err1; + } + table = cpufreq_frequency_get_table(cpu); + + for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { + + if (table[i].frequency == CPUFREQ_ENTRY_INVALID) { + continue; + } + + if (count == p) { + + INIT_WORK((struct work_struct*)work, pstate_switch_workfn); + work->freq = table[i].frequency; + queue_work(pstate_wq, (struct work_struct*)work); + + state_set = 1; + break; + } + + count++; + last_valid = i; + } + + /* we need to deal with the case in which we get a number > max pstate */ + if (!state_set) { + INIT_WORK((struct work_struct*)work, pstate_switch_workfn); + work->freq = table[last_valid].frequency; + queue_work(pstate_wq, (struct work_struct*)work); + } + + palacios_free(policy); + return 0; + +out_err1: + palacios_free(work); +out_err: + palacios_free(policy); + return -1; +} + + +static int linux_set_freq(uint64_t f) +{ + struct cpufreq_policy * policy = NULL; + pstate_work_t * work = NULL; + uint64_t freq; + unsigned int cpu = get_cpu(); + put_cpu(); + + policy = palacios_alloc(sizeof(struct cpufreq_policy)); + if (!policy) { + ERROR("Could not allocate policy struct\n"); + return -1; + } + + work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t)); + if (!work) { + ERROR("Could not allocate work struct\n"); + goto out_err; + } + + if (cpufreq_get_policy(policy, cpu) != 0) { + ERROR("Could not get cpufreq policy\n"); + goto out_err1; + } + + if (f < policy->min) { + freq = policy->min; + } else if (f > policy->max) { + freq = policy->max; + } else { + freq = f; + } + + INIT_WORK((struct work_struct*)work, pstate_switch_workfn); + work->freq = freq; + queue_work(pstate_wq, (struct work_struct*)work); + + palacios_free(policy); + return 0; + +out_err1: + palacios_free(work); +out_err: + palacios_free(policy); + return -1; +} + + +static int linux_restore_defaults(void) +{ + char * gov = NULL; + unsigned int cpu = get_cpu(); + put_cpu(); + + gov = get_cpu_var(core_state).linux_governor; + put_cpu_var(core_state); + + DEBUG("restoring previous governor (%s)\n", gov); + + if (governor_switch(gov, cpu) < 0) { + ERROR("Could not restore governor to (%s)\n", gov); + goto out_err; + } + + free_linux_governor(); + return 0; + +out_err: + free_linux_governor(); + return -1; +} + /****************************************************************** Generic Interface as provided to Palacios and to the rest of the module -******************************************************************/ + ******************************************************************/ static void init_core(void) { @@ -481,17 +1263,17 @@ static void init_core(void) struct cpufreq_policy *p; - DEBUG("P-State Core Init\n"); + //DEBUG("P-State Core Init\n"); get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL; get_cpu_var(core_state).cur_pstate = 0; - + if (machine_state.funcs) { - get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate(); - get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate(); + get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate(); + get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate(); } else { - get_cpu_var(core_state).min_pstate = 0; - get_cpu_var(core_state).max_pstate = 0; + get_cpu_var(core_state).min_pstate = 0; + get_cpu_var(core_state).max_pstate = 0; } @@ -500,20 +1282,26 @@ static void init_core(void) p = cpufreq_cpu_get(cpu); if (!p) { - get_cpu_var(core_state).have_cpufreq = 0; - get_cpu_var(core_state).min_freq_khz=0; - 
get_cpu_var(core_state).max_freq_khz=0; - get_cpu_var(core_state).cur_freq_khz=0; + get_cpu_var(core_state).have_cpufreq = 0; + get_cpu_var(core_state).min_freq_khz=0; + get_cpu_var(core_state).max_freq_khz=0; + get_cpu_var(core_state).cur_freq_khz=0; } else { - get_cpu_var(core_state).have_cpufreq = 1; - get_cpu_var(core_state).min_freq_khz=p->min; - get_cpu_var(core_state).max_freq_khz=p->max; - get_cpu_var(core_state).cur_freq_khz=p->cur; - cpufreq_cpu_put(p); - } - + get_cpu_var(core_state).have_cpufreq = 1; + get_cpu_var(core_state).min_freq_khz=p->min; + get_cpu_var(core_state).max_freq_khz=p->max; + get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); put_cpu_var(core_state); - + + /* + for (i=0;iperformance->state_count; i++) { + INFO("P-State: %u: freq=%llu ctrl=%llx", + i, + get_cpu_var(processors)->performance->states[i].core_frequency*1000, + get_cpu_var(processors)->performance->states[i].control); + } + put_cpu_var(processors); + */ } @@ -524,6 +1312,7 @@ static void deinit_core(void) { DEBUG("P-State Core Deinit\n"); palacios_pstate_ctrl_release(); + } @@ -531,16 +1320,16 @@ static void deinit_core(void) void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) { memset(c,0,sizeof(struct v3_cpu_pstate_chars)); - + c->features = V3_PSTATE_INTERNAL_CONTROL; if (get_cpu_var(core_state).have_cpufreq) { - c->features |= V3_PSTATE_EXTERNAL_CONTROL; + c->features |= V3_PSTATE_EXTERNAL_CONTROL; } if (machine_state.arch==AMD || machine_state.arch==INTEL) { - c->features |= V3_PSTATE_DIRECT_CONTROL; + c->features |= V3_PSTATE_DIRECT_CONTROL; } c->cur_mode = get_cpu_var(core_state).mode; c->min_pstate = get_cpu_var(core_state).min_pstate; @@ -552,28 +1341,37 @@ void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) put_cpu_var(core_state); - - + + } -uint8_t palacios_pstate_ctrl_get_pstate(void) +uint64_t palacios_pstate_ctrl_get_pstate(void) { if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { - put_cpu_var(core_state); - return machine_state.funcs->get_pstate(); + put_cpu_var(core_state); + return machine_state.funcs->get_pstate(); + } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { + put_cpu_var(core_state); + return linux_get_pstate(); } else { - put_cpu_var(core_state); - return 0; + put_cpu_var(core_state); + return 0; } } -void palacios_pstate_ctrl_set_pstate(uint8_t p) + +void palacios_pstate_ctrl_set_pstate(uint64_t p) { if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { - put_cpu_var(core_state); - machine_state.funcs->set_pstate(p); - } + put_cpu_var(core_state); + machine_state.funcs->set_pstate(p); + } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { + put_cpu_var(core_state); + linux_set_pstate(p); + } else { + put_cpu_var(core_state); + } } @@ -582,128 +1380,162 @@ void palacios_pstate_ctrl_set_pstate_wrapper(void *p) palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p); } + uint64_t palacios_pstate_ctrl_get_freq(void) { if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { - put_cpu_var(core_state); - ERROR("Unimplemented get freq\n"); - return 0; + put_cpu_var(core_state); + return linux_get_freq(); } else { - put_cpu_var(core_state); - return 0; + put_cpu_var(core_state); + return 0; } } + void palacios_pstate_ctrl_set_freq(uint64_t p) { if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { - put_cpu_var(core_state); - ERROR("Unimplemented set freq\n"); - } - put_cpu_var(core_state); - + put_cpu_var(core_state); + linux_set_freq(p); + } else { + 
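        /* not under external control: nothing to change, but we still
         * have to balance the get_cpu_var() taken in the test above
         */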
put_cpu_var(core_state); + } } -static void switch_to_external(void) +static int switch_to_external(void) { + DEBUG("switch from host control to external\n"); + if (!(get_cpu_var(core_state).have_cpufreq)) { - put_cpu_var(core_state); - ERROR("No cpufreq - cannot switch to external...\n"); - return; - } + put_cpu_var(core_state); + ERROR("No cpufreq - cannot switch to external...\n"); + return -1; + } put_cpu_var(core_state); - ERROR("Unimplemented switch to external...\n"); + linux_setup_palacios_governor(); + + get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL; + put_cpu_var(core_state); + + return 0; } - -static void switch_to_direct(void) + + +static int switch_to_direct(void) { + DEBUG("switch from host control to direct\n"); + if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - ERROR("Unimplemented: switch to direct on machine with cpu freq\n"); - // The implementation would set the policy and governor to peg cpu - // regardless of load + put_cpu_var(core_state); + DEBUG("switch to direct from cpufreq\n"); + + // The implementation would set the policy and governor to peg cpu + // regardless of load + linux_setup_palacios_governor(); + } else { + put_cpu_var(core_state); } if (machine_state.funcs && machine_state.funcs->arch_init) { - get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL; - - machine_state.funcs->arch_init(); + get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL; + + machine_state.funcs->arch_init(); - put_cpu_var(core_state); + put_cpu_var(core_state); } + return 0; } - -static void switch_to_internal(void) + +static int switch_to_internal(void) { + DEBUG("switch from host control to internal\n"); + if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - ERROR("Unimplemented: switch to internal on machine with cpu freq\n"); - return; - // The implementation would set the policy and governor to peg cpu - // regardless of load - exactly like direct + put_cpu_var(core_state); + DEBUG("switch to internal on machine with cpu freq\n"); + linux_setup_palacios_governor(); + } else { + put_cpu_var(core_state); } get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL; - + put_cpu_var(core_state); - return; + return 0; } -static void switch_from_external(void) +static int switch_from_external(void) { if (!(get_cpu_var(core_state).have_cpufreq)) { - put_cpu_var(core_state); - ERROR("No cpufreq - how did we get here... external...\n"); - return; + put_cpu_var(core_state); + ERROR("No cpufreq - how did we get here... 
external...\n"); + return -1; } + put_cpu_var(core_state); - ERROR("Unimplemented switch from external...\n"); - - get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL; + DEBUG("Switching back to host control from external\n"); + if (get_cpu_var(core_state).have_cpufreq) { + put_cpu_var(core_state); + linux_restore_defaults(); + } else { + put_cpu_var(core_state); + } + + get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL; put_cpu_var(core_state); + return 0; } - -static void switch_from_direct(void) -{ - - if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - ERROR("Unimplemented: switch from direct on machine with cpu freq - will just pretend to do so\n"); - // The implementation would switch back to default policy and governor - } - get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL; +static int switch_from_direct(void) +{ - machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate); + DEBUG("Switching back to host control from direct\n"); + // Set maximum performance, just in case there is no host control + machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate); machine_state.funcs->arch_deinit(); + if (get_cpu_var(core_state).have_cpufreq) { + put_cpu_var(core_state); + linux_restore_defaults(); + } else { + put_cpu_var(core_state); + } + + get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL; + put_cpu_var(core_state); + + return 0; } - -static void switch_from_internal(void) + +static int switch_from_internal(void) { + DEBUG("Switching back to host control from internal\n"); + if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n"); - // The implementation would switch back to default policy and governor + put_cpu_var(core_state); + linux_restore_defaults(); + } else { + put_cpu_var(core_state); } get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL; put_cpu_var(core_state); - - return; + + return 0; } @@ -711,24 +1543,25 @@ static void switch_from_internal(void) void palacios_pstate_ctrl_acquire(uint32_t type) { if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { - palacios_pstate_ctrl_release(); + put_cpu_var(core_state); + palacios_pstate_ctrl_release(); + } else { + put_cpu_var(core_state); } - put_cpu_var(core_state); - switch (type) { - case V3_PSTATE_EXTERNAL_CONTROL: - switch_to_external(); - break; - case V3_PSTATE_DIRECT_CONTROL: - switch_to_direct(); - break; - case V3_PSTATE_INTERNAL_CONTROL: - switch_to_internal(); - break; - default: - ERROR("Unknown pstate control type %u\n",type); - break; + case V3_PSTATE_EXTERNAL_CONTROL: + switch_to_external(); + break; + case V3_PSTATE_DIRECT_CONTROL: + switch_to_direct(); + break; + case V3_PSTATE_INTERNAL_CONTROL: + switch_to_internal(); + break; + default: + ERROR("Unknown pstate control type %u\n",type); + break; } } @@ -747,47 +1580,48 @@ static void palacios_pstate_ctrl_acquire_direct(void) void palacios_pstate_ctrl_release(void) { - if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { - put_cpu_var(core_state); - return; - } + put_cpu_var(core_state); + return; + } + put_cpu_var(core_state); switch (get_cpu_var(core_state).mode) { - case V3_PSTATE_EXTERNAL_CONTROL: - switch_from_external(); - break; - case V3_PSTATE_DIRECT_CONTROL: - switch_from_direct(); - break; - case V3_PSTATE_INTERNAL_CONTROL: - switch_from_internal(); - break; - default: - ERROR("Unknown pstate control type %u\n",core_state.mode); - break; + case 
V3_PSTATE_EXTERNAL_CONTROL: + put_cpu_var(core_state); + switch_from_external(); + break; + case V3_PSTATE_DIRECT_CONTROL: + put_cpu_var(core_state); + switch_from_direct(); + break; + case V3_PSTATE_INTERNAL_CONTROL: + put_cpu_var(core_state); + switch_from_internal(); + break; + default: + put_cpu_var(core_state); + ERROR("Unknown pstate control type %u\n",core_state.mode); + break; } - - put_cpu_var(core_state); - } static void update_hw_pstate(void *arg) { if (machine_state.funcs && machine_state.funcs->get_pstate) { - get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate(); - put_cpu_var(core_state); + get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate(); + put_cpu_var(core_state); } else { - get_cpu_var(core_state).cur_hw_pstate = 0; - put_cpu_var(core_state); + get_cpu_var(core_state).cur_hw_pstate = 0; + put_cpu_var(core_state); } } /*************************************************************************** PROC Interface to expose state -***************************************************************************/ + ***************************************************************************/ static int pstate_show(struct seq_file * file, void * v) { @@ -797,36 +1631,24 @@ static int pstate_show(struct seq_file * file, void * v) seq_printf(file, "V3VEE DVFS Status\n\n"); for (cpu=0;cpucur_hw_pstate, - s->mode==V3_PSTATE_HOST_CONTROL ? "host" : - s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" : - s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : - s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN"); - if (s->have_cpufreq) { - seq_printf(file," external "); - } - if (machine_state.arch==AMD || machine_state.arch==INTEL) { - seq_printf(file,"direct "); - } - seq_printf(file,"internal ] "); - if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { - seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz); - } - if (s->mode==V3_PSTATE_DIRECT_CONTROL) { - seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate); - } - seq_printf(file,"\n"); + struct pstate_core_info *s = &per_cpu(core_state,cpu); + seq_printf(file,"pcore %u: hw pstate 0x%llx mode %s ",cpu, + s->cur_hw_pstate, + s->mode==V3_PSTATE_HOST_CONTROL ? "host" : + s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" : + s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : + s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN"); + if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { + seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz); + } + if (s->mode==V3_PSTATE_DIRECT_CONTROL) { + seq_printf(file,"(min=%llu max=%llu cur=%llu) ",s->min_pstate, s->max_pstate, s->cur_pstate); + } + seq_printf(file,"\n"); } return 0; } @@ -845,81 +1667,174 @@ static struct file_operations pstate_fops = { .release = seq_release }; +static int pstate_hw_show(struct seq_file * file, void * v) +{ + int numstates; + + seq_printf(file, "V3VEE DVFS Hardware Info\n(all logical cores assumed identical)\n\n"); + + seq_printf(file, "Arch: \t%s\n" + "PStates:\t%s\n\n", + machine_state.arch==INTEL ? "Intel" : + machine_state.arch==AMD ? "AMD" : "Other", + machine_state.supports_pstates ? "Yes" : "No"); + + +#define YN(x) ((x) ? 
"Y" : "N") + + if (machine_state.arch==INTEL) { + seq_printf(file,"SpeedStep: \t%s\n",YN(machine_state.have_speedstep)); + seq_printf(file,"APERF/MPERF: \t%s\n",YN(machine_state.have_pstate_hw_coord)); + seq_printf(file,"IDA or TurboCore: \t%s\n",YN(machine_state.have_opportunistic)); + seq_printf(file,"Policy Hint: \t%s\n",YN(machine_state.have_policy_hint)); + seq_printf(file,"Hardware Policy: \t%s\n",YN(machine_state.have_hwp)); + seq_printf(file,"Hardware Duty Cycle: \t%s\n",YN(machine_state.have_hdc)); + seq_printf(file,"MWAIT extensions: \t%s\n",YN(machine_state.have_mwait_ext)); + seq_printf(file,"MWAIT wake on intr: \t%s\n",YN(machine_state.have_mwait_int)); + } + + if (machine_state.arch==AMD) { + seq_printf(file,"PState: \t%s\n",YN(machine_state.have_pstate)); + seq_printf(file,"APERF/MPERF: \t%s\n",YN(machine_state.have_pstate_hw_coord)); + seq_printf(file,"CoreBoost: \t%s\n",YN(machine_state.have_coreboost)); + seq_printf(file,"Feedback: \t%s\n",YN(machine_state.have_feedback)); + } + + + seq_printf(file,"\nPstate\tCtrl\tKHz\tmW\tuS(X)\tuS(B)\n"); + numstates = get_cpu_var(processors)->performance->state_count; + if (!numstates) { + seq_printf(file,"UNKNOWN\n"); + } else { + int i; + for (i=0;iperformance->states[i].control, + get_cpu_var(processors)->performance->states[i].core_frequency*1000, + get_cpu_var(processors)->performance->states[i].power, + get_cpu_var(processors)->performance->states[i].transition_latency, + get_cpu_var(processors)->performance->states[i].bus_master_latency); + } + } + put_cpu_var(processors); + + seq_printf(file,"\nAvailable Modes:"); + seq_printf(file," host"); + if (get_cpu_var(core_state).have_cpufreq) { + seq_printf(file," external"); + } + put_cpu_var(core_state); + if (machine_state.supports_pstates) { + seq_printf(file," direct"); + } + seq_printf(file," internal\n"); + + return 0; +} + +static int pstate_hw_open(struct inode * inode, struct file * file) +{ + return single_open(file, pstate_hw_show, NULL); +} + + +static struct file_operations pstate_hw_fops = { + .owner = THIS_MODULE, + .open = pstate_hw_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + + int pstate_proc_setup(void) { struct proc_dir_entry *proc; - - proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir()); + struct proc_dir_entry *prochw; + + PAL_PROC_CREATE(proc,"v3-dvfs",0444,palacios_get_procdir(),&pstate_fops); if (!proc) { - ERROR("Failed to create proc entry for p-state control\n"); - return -1; + ERROR("Failed to create proc entry for p-state control\n"); + return -1; } - - proc->proc_fops = &pstate_fops; - + + INFO("/proc/v3vee/v3-dvfs successfully created\n"); + + PAL_PROC_CREATE(prochw,"v3-dvfs-hw",0444,palacios_get_procdir(),&pstate_hw_fops); + + if (!prochw) { + ERROR("Failed to create proc entry for p-state hw info\n"); + return -1; + } + + INFO("/proc/v3vee/v3-dvfs-hw successfully created\n"); + return 0; } - + void pstate_proc_teardown(void) { + remove_proc_entry("v3-dvfs-hw",palacios_get_procdir()); remove_proc_entry("v3-dvfs",palacios_get_procdir()); } /******************************************************************** User interface (ioctls) -********************************************************************/ + ********************************************************************/ static int dvfs_ctrl(unsigned int cmd, unsigned long arg) { struct v3_dvfs_ctrl_request r; if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) { - ERROR("Failed to copy DVFS request from user\n"); - return 
-EFAULT; + ERROR("Failed to copy DVFS request from user\n"); + return -EFAULT; } if (r.pcore >= num_online_cpus()) { - ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore); - return -EFAULT; + ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore); + return -EFAULT; } switch (r.cmd) { - case V3_DVFS_ACQUIRE: { - switch (r.acq_type) { - case V3_DVFS_EXTERNAL: - palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external,0); - return 0; - break; - case V3_DVFS_DIRECT: - palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct,0); - return 0; - break; - default: - ERROR("Unknown DVFS acquire type %u\n",r.acq_type); - return -EFAULT; - } - } - break; - case V3_DVFS_RELEASE: { - palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release,0); - return 0; - } - break; - case V3_DVFS_SETFREQ: { - palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz); - return 0; - } - break; - case V3_DVFS_SETPSTATE: { - palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate); - return 0; - } - default: { - ERROR("Unknown DVFS command %u\n",r.cmd); - return -EFAULT; - } - break; + case V3_DVFS_ACQUIRE: { + switch (r.acq_type) { + case V3_DVFS_EXTERNAL: + palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL); + return 0; + break; + case V3_DVFS_DIRECT: + palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL); + return 0; + break; + default: + ERROR("Unknown DVFS acquire type %u\n",r.acq_type); + return -EFAULT; + } + } + break; + case V3_DVFS_RELEASE: { + palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL); + return 0; + } + break; + case V3_DVFS_SETFREQ: { + palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz); + return 0; + } + break; + case V3_DVFS_SETPSTATE: { + palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate); + return 0; + } + default: { + ERROR("Unknown DVFS command %u\n",r.cmd); + return -EFAULT; + } + break; } } @@ -946,7 +1861,7 @@ static struct v3_host_pstate_ctrl_iface hooks = { }; - + static int pstate_ctrl_init(void) { unsigned int cpu; @@ -955,18 +1870,20 @@ static int pstate_ctrl_init(void) pstate_arch_setup(); for (cpu=0;cpu