From: Peter Dinda Date: Tue, 26 Aug 2014 15:41:40 +0000 (-0500) Subject: P-State (DVFS) Enhancements X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=366bf119d0307245296101e01bf509afab3eb9f8 P-State (DVFS) Enhancements - Change of pstate interface to be opaque 64 bit number - Nearly complete "direct" implementation for Intel --- diff --git a/linux_module/iface-pstate-ctrl.c b/linux_module/iface-pstate-ctrl.c index ac06b20..46ee834 100644 --- a/linux_module/iface-pstate-ctrl.c +++ b/linux_module/iface-pstate-ctrl.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -30,6 +29,10 @@ #include #include +// Used to determine the appropriate pstates values on Intel +#include +#include + #include #include "palacios.h" @@ -49,6 +52,12 @@ an ioctl for commanding the implementation and a /proc file for showing current status and capabilities. + What we mean by "pstate" here is the processor's internal + configuration. For AMD, this is defined as being the same as + the ACPI-defined p-state. For Intel, it is not. There, it is the + contents of the perf ctl MSR, which, often, is the frequency id + and voltage id (the multipliers). + */ @@ -101,10 +110,10 @@ static DEFINE_PER_CPU(struct pstate_core_info, core_state); struct pstate_core_funcs { void (*arch_init)(void); void (*arch_deinit)(void); - uint8_t (*get_min_pstate)(void); - uint8_t (*get_max_pstate)(void); - uint8_t (*get_pstate)(void); - void (*set_pstate)(uint8_t pstate); + uint64_t (*get_min_pstate)(void); + uint64_t (*get_max_pstate)(void); + uint64_t (*get_pstate)(void); + void (*set_pstate)(uint64_t pstate); }; struct pstate_machine_info { @@ -217,7 +226,7 @@ static void deinit_arch_amd(void) } -static uint8_t get_pstate_amd(void) +static uint64_t get_pstate_amd(void) { struct p_state_stat_reg_amd pstat; @@ -230,7 +239,7 @@ static uint8_t get_pstate_amd(void) } -static void set_pstate_amd(uint8_t p) +static void set_pstate_amd(uint64_t p) { struct p_state_ctl_reg_amd pctl; pctl.val = 0; @@ -246,7 +255,7 @@ static void set_pstate_amd(uint8_t p) /* * NOTE: HW may change this value at runtime */ -static uint8_t get_max_pstate_amd(void) +static uint64_t get_max_pstate_amd(void) { struct p_state_limit_reg_amd plimits; @@ -256,7 +265,7 @@ static uint8_t get_max_pstate_amd(void) } -static uint8_t get_min_pstate_amd(void) +static uint64_t get_min_pstate_amd(void) { struct p_state_limit_reg_amd plimits; @@ -375,6 +384,21 @@ struct turbo_mode_info_reg_intel { } __attribute__((packed)); } __attribute__((packed)); +// This replicates the critical information in Linux's struct acpi_processor_px +// To make it easier to port to other OSes. +struct intel_pstate_info { + uint64_t freq; // KHz + uint64_t ctrl; // What to write into the _CTL MSR to get this +}; + +// The internal array will be used if we cannot build the table locally +static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0; +static int intel_num_pstates_internal=0; + +// These will either point to the internal array or to a constructed array +static struct intel_pstate_info *intel_pstate_to_ctrl=0; +static int intel_num_pstates=0; + /* CPUID.01:ECX.AES(7) */ static uint8_t supports_pstates_intel(void) @@ -408,6 +432,43 @@ static uint8_t supports_pstates_intel(void) machine_state.have_mwait_ext, machine_state.have_mwait_int ); + + if (machine_state.have_speedstep) { + uint32_t i; + // Build mapping table (from "pstate" (0..) to ctrl value for MSR + if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { + put_cpu_var(processors); + // no acpi... revert to internal table + intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal; + intel_num_pstates=intel_num_pstates_internal; + } else { + intel_num_pstates = get_cpu_var(processors)->performance->state_count; + if (intel_num_pstates) { + intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates); + if (!intel_pstate_to_ctrl) { + ERROR("P-State: Cannot allocate space for mapping...\n"); + intel_num_pstates=0; + } + for (i=0;iperformance->states[i].core_frequency*1000; + intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control; + } + + } else { + ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n"); + } + } + put_cpu_var(processors); + INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates); + for (i=0;i> 16) & 0x1; put_cpu_var(core_state); @@ -426,6 +489,8 @@ static void init_arch_intel(void) val |= 1 << 16; wrmsrl(MSR_MISC_ENABLE_IA32, val); + //INFO("P-State: write ENABLE=%llx\n",val); + } static void deinit_arch_intel(void) @@ -434,40 +499,47 @@ static void deinit_arch_intel(void) rdmsrl(MSR_MISC_ENABLE_IA32, val); + //INFO("P-State: deinit: ENABLE=%llx\n",val); + val &= ~(1ULL << 16); val |= get_cpu_var(core_state).prior_speedstep << 16; put_cpu_var(core_state); wrmsrl(MSR_MISC_ENABLE_IA32, val); + //INFO("P-state: deinit ENABLE=%llx\n",val); + } /* TODO: Intel P-states require sampling at intervals... */ -static uint8_t get_pstate_intel(void) +static uint64_t get_pstate_intel(void) { uint64_t val; - uint16_t pstate; rdmsrl(MSR_PERF_STAT_IA32,val); - pstate = val & 0xffff; - - INFO("P-State: Get: 0x%llx\n", val); - - // Assume top byte is the FID - //if (pstate & 0xff ) { - // ERROR("P-State: Intel returns confusing pstate %u\n",pstate); - //} + //INFO("P-State: Get: 0x%llx\n", val); // should check if turbo is active, in which case // this value is not the whole story - return (uint8_t) (pstate>>8); + return val; } -static void set_pstate_intel(uint8_t p) +static void set_pstate_intel(uint64_t p) { uint64_t val; + uint64_t ctrl; + + if (intel_num_pstates==0) { + return ; + } else { + if (p>=intel_num_pstates) { + p=intel_num_pstates-1; + } + } + + ctrl=intel_pstate_to_ctrl[p].ctrl; /* ...Intel IDA (dynamic acceleration) if (c->no_turbo && !c->turbo_disabled) { @@ -478,8 +550,10 @@ static void set_pstate_intel(uint8_t p) // fid bits rdmsrl(MSR_PERF_CTL_IA32, val); - val &= ~0xff00ULL; - val |= ((uint64_t)p)<<8; + INFO("P-State: Pre-Set: 0x%llx\n", val); + + val &= ~0xffffULL; + val |= ctrl & 0xffffULL; INFO("P-State: Set: 0x%llx\n", val); @@ -490,24 +564,20 @@ static void set_pstate_intel(uint8_t p) } -static uint8_t get_min_pstate_intel(void) +static uint64_t get_min_pstate_intel(void) { - struct turbo_mode_info_reg_intel t; - - rdmsrl(MSR_PLATFORM_INFO_IA32, t.val); - - return t.reg.min_ratio; + return 0; } -static uint8_t get_max_pstate_intel (void) +static uint64_t get_max_pstate_intel (void) { - struct turbo_mode_info_reg_intel t; - - rdmsrl(MSR_PLATFORM_INFO_IA32, t.val); - - return t.reg.max_noturbo_ratio; + if (intel_num_pstates==0) { + return 0; + } else { + return intel_num_pstates-1; + } } static struct pstate_core_funcs intel_funcs = @@ -773,19 +843,12 @@ static int linux_setup_palacios_governor(void) } -#if 0 -static int linux_deinit(void) -{ - return 0; -} -#endif - static int linux_get_pstate(void) { struct cpufreq_policy * policy = NULL; struct cpufreq_frequency_table *table; - int cpu = get_cpu(); + int cpu = get_cpu(); unsigned int i = 0; unsigned int count = 0; @@ -812,6 +875,8 @@ static int linux_get_pstate(void) } palacios_free(policy); + + put_cpu(); return count; } @@ -950,6 +1015,7 @@ static void init_core(void) { unsigned cpu; struct cpufreq_policy *p; + unsigned int i; DEBUG("P-State Core Init\n"); @@ -980,11 +1046,19 @@ static void init_core(void) get_cpu_var(core_state).min_freq_khz=p->min; get_cpu_var(core_state).max_freq_khz=p->max; get_cpu_var(core_state).cur_freq_khz=p->cur; - cpufreq_cpu_put(p); } + + cpufreq_cpu_put(p); put_cpu_var(core_state); + for (i=0;iperformance->state_count; i++) { + INFO("P-State: %u: freq=%llu ctrl=%llx", + i, + get_cpu_var(processors)->performance->states[i].core_frequency*1000, + get_cpu_var(processors)->performance->states[i].control); + } + put_cpu_var(processors); } @@ -993,9 +1067,7 @@ void palacios_pstate_ctrl_release(void); static void deinit_core(void) { - int cpu; DEBUG("P-State Core Deinit\n"); - cpu = get_cpu(); palacios_pstate_ctrl_release(); } @@ -1030,7 +1102,7 @@ void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) } -uint8_t palacios_pstate_ctrl_get_pstate(void) +uint64_t palacios_pstate_ctrl_get_pstate(void) { if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { put_cpu_var(core_state); @@ -1045,7 +1117,7 @@ uint8_t palacios_pstate_ctrl_get_pstate(void) } -void palacios_pstate_ctrl_set_pstate(uint8_t p) +void palacios_pstate_ctrl_set_pstate(uint64_t p) { if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { put_cpu_var(core_state); @@ -1087,6 +1159,8 @@ void palacios_pstate_ctrl_set_freq(uint64_t p) static int switch_to_external(void) { + DEBUG("switch from host control to external\n"); + if (!(get_cpu_var(core_state).have_cpufreq)) { put_cpu_var(core_state); ERROR("No cpufreq - cannot switch to external...\n"); @@ -1094,13 +1168,19 @@ static int switch_to_external(void) } put_cpu_var(core_state); - DEBUG("Switching to external control\n"); - return linux_restore_defaults(); + linux_setup_palacios_governor(); + + get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL; + put_cpu_var(core_state); + + return 0; } static int switch_to_direct(void) { + DEBUG("switch from host control to direct\n"); + if (get_cpu_var(core_state).have_cpufreq) { put_cpu_var(core_state); DEBUG("switch to direct from cpufreq\n"); @@ -1124,6 +1204,8 @@ static int switch_to_direct(void) static int switch_to_internal(void) { + DEBUG("switch from host control to internal\n"); + if (get_cpu_var(core_state).have_cpufreq) { put_cpu_var(core_state); DEBUG("switch to internal on machine with cpu freq\n"); @@ -1146,8 +1228,11 @@ static int switch_from_external(void) return -1; } - DEBUG("Switching from external...\n"); - linux_restore_defaults(); + DEBUG("Switching back to host control from external\n"); + + if (get_cpu_var(core_state).have_cpufreq) { + linux_restore_defaults(); + } get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL; @@ -1159,18 +1244,19 @@ static int switch_from_external(void) static int switch_from_direct(void) { + + DEBUG("Switching back to host control from direct\n"); + + // Set maximum performance, just in case there is no host control + machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate); + machine_state.funcs->arch_deinit(); + if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - DEBUG("Switching back to cpufreq control from direct\n"); linux_restore_defaults(); } get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL; - machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate); - - machine_state.funcs->arch_deinit(); - put_cpu_var(core_state); return 0; @@ -1179,9 +1265,10 @@ static int switch_from_direct(void) static int switch_from_internal(void) { + DEBUG("Switching back to host control from internal\n"); + if (get_cpu_var(core_state).have_cpufreq) { - put_cpu_var(core_state); - ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n"); + // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n"); // The implementation would switch back to default policy and governor linux_restore_defaults(); } @@ -1293,7 +1380,7 @@ static int pstate_show(struct seq_file * file, void * v) for (cpu=0;cpucur_hw_pstate, s->mode==V3_PSTATE_HOST_CONTROL ? "host" : s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" : @@ -1476,6 +1563,13 @@ static int pstate_ctrl_deinit(void) palacios_xcall(cpu,(void (*)(void *))deinit_core,0); } + + // Free any mapping table we built for Intel + if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { + palacios_free(intel_pstate_to_ctrl); + } + + return 0; } diff --git a/linux_module/iface-pstate-ctrl.h b/linux_module/iface-pstate-ctrl.h index adc6daa..dd3577c 100644 --- a/linux_module/iface-pstate-ctrl.h +++ b/linux_module/iface-pstate-ctrl.h @@ -34,8 +34,8 @@ void palacios_pstate_ctrl_acquire(uint32_t type); void palacios_pstate_ctrl_release(void); -uint8_t palacios_pstate_ctrl_get_pstate(void); -void palacios_pstate_ctrl_set_pstate(uint8_t p); +uint64_t palacios_pstate_ctrl_get_pstate(void); +void palacios_pstate_ctrl_set_pstate(uint64_t p); uint64_t palacios_pstate_ctrl_get_freq(void); void palacios_pstate_ctrl_set_freq(uint64_t f_khz); @@ -52,7 +52,7 @@ struct v3_dvfs_ctrl_request { // Direct for setting pstate directly using module uint32_t pcore; // Which core we mean uint64_t freq_khz; // for setfreq - uint8_t pstate; // for setpstate + uint64_t pstate; // for setpstate }; #endif diff --git a/linux_usr/v3_dvfs.c b/linux_usr/v3_dvfs.c index edd655f..599d3ea 100644 --- a/linux_usr/v3_dvfs.c +++ b/linux_usr/v3_dvfs.c @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) return -1; } } else if (!strcasecmp(cmd,"pstate")) { - if (v3_user_dvfs_set_pstate(core,atoi(arg))) { + if (v3_user_dvfs_set_pstate(core,atoll(arg))) { fprintf(stderr,"Failed to set core %u to pstate %d\n",core,atoi(arg)); rc=-1; } else { diff --git a/linux_usr/v3_user_dvfs.c b/linux_usr/v3_user_dvfs.c index 8d4d930..1db1255 100644 --- a/linux_usr/v3_user_dvfs.c +++ b/linux_usr/v3_user_dvfs.c @@ -48,7 +48,7 @@ int v3_user_dvfs_release(uint32_t core) } -int v3_user_dvfs_set_pstate(uint32_t core, uint8_t pstate) +int v3_user_dvfs_set_pstate(uint32_t core, uint64_t pstate) { struct v3_dvfs_ctrl_request r; diff --git a/palacios/include/interfaces/vmm_pstate_ctrl.h b/palacios/include/interfaces/vmm_pstate_ctrl.h index 7096e17..2e594ca 100644 --- a/palacios/include/interfaces/vmm_pstate_ctrl.h +++ b/palacios/include/interfaces/vmm_pstate_ctrl.h @@ -33,9 +33,11 @@ struct v3_cpu_pstate_chars { uint64_t min_freq_khz; // minimum frequency that can be configed by EXTERANL_CONTROL uint64_t max_freq_khz; // maximum frequency that can be configed by EXTERANL_CONTROL uint64_t cur_freq_khz; // current selected frequency only meaningful under EXTERANL CONTROL - uint8_t min_pstate; // minimum pstate that can be configed by DIRECT_CONTROL - uint8_t max_pstate; // maximum pstate that can be configed by DIRECT_CONTROL - uint8_t cur_pstate; // current selected pstate only meaningful under DIRECT_CONTROL + // Note that "pstate" is an opaque quantity not necessarily the + // ACPI p-state model, although on some processors they are the same + uint64_t min_pstate; // minimum pstate that can be configed by DIRECT_CONTROL + uint64_t max_pstate; // maximum pstate that can be configed by DIRECT_CONTROL + uint64_t cur_pstate; // current selected pstate only meaningful under DIRECT_CONTROL } ; @@ -47,8 +49,8 @@ struct v3_host_pstate_ctrl_iface { void (*acquire)(uint32_t type); void (*release)(void); // pstate control applies if we have acquired DIRECT_CONTROL - void (*set_pstate)(uint8_t pstate); - uint8_t (*get_pstate)(void); + void (*set_pstate)(uint64_t pstate); + uint64_t (*get_pstate)(void); // freq control applies if we have acquired EXTERNAL_CONTROL void (*set_freq)(uint64_t freq_khz); uint64_t (*get_freq)(void); @@ -66,8 +68,8 @@ void v3_get_cpu_pstate_chars(struct v3_cpu_pstate_chars *chars); void v3_acquire_pstate_ctrl(uint32_t type); // for DIRECT_CONTROL -uint8_t v3_get_cpu_pstate(void); -void v3_set_cpu_pstate (uint8_t p); +uint64_t v3_get_cpu_pstate(void); +void v3_set_cpu_pstate (uint64_t p); // for EXTERANL_CONTROL uint64_t v3_get_cpu_freq(void); diff --git a/palacios/src/interfaces/vmm_pstate_ctrl.c b/palacios/src/interfaces/vmm_pstate_ctrl.c index e810b47..a7a9ce8 100644 --- a/palacios/src/interfaces/vmm_pstate_ctrl.c +++ b/palacios/src/interfaces/vmm_pstate_ctrl.c @@ -51,7 +51,7 @@ void v3_acquire_pstate_ctrl(uint32_t type) } -uint8_t v3_get_cpu_pstate(void) +uint64_t v3_get_cpu_pstate(void) { if (pstate_ctrl_hooks && pstate_ctrl_hooks->get_pstate) { return pstate_ctrl_hooks->get_pstate(); @@ -60,7 +60,7 @@ uint8_t v3_get_cpu_pstate(void) } } -void v3_set_cpu_pstate (uint8_t p) +void v3_set_cpu_pstate (uint64_t p) { if (pstate_ctrl_hooks && pstate_ctrl_hooks->set_pstate) { pstate_ctrl_hooks->set_pstate(p);