#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/cpufreq.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/msr-index.h>
+// Used to determine the appropriate pstates values on Intel
+#include <linux/acpi.h>
+#include <acpi/processor.h>
+
#include <interfaces/vmm_pstate_ctrl.h>
#include "palacios.h"
#include "linux-exts.h"
/*
- This P-STATE control implementation includes:
-
- - Direct control of Intel and AMD processor pstates
- - External control of processor states via Linux (unimplemented)
- - Internal control of processor states in Palacios (handoff from Linux)
-
- Additionally, it provides a user-space interface for manipulating
- p-state regardless of the host's functionality. This includes
- an ioctl for commanding the implementation and a /proc file for
- showing current status and capabilities.
+ This P-STATE control implementation includes the following modes.
+ You can switch between modes at any time.
+
+ - Internal control of processor states in Palacios (handoff from Linux)
+ When Palacios acquires this control, this module disables Linux cpufreq control
+ and allows code within Palacios unfettered access to the DVFS hardware.
+ - Direct control of Intel and AMD processor pstates using code in this module
+ When you acquire this control, this module disables Linux cpufreq control
+ and directly programs the processor itself in response to your requests
+ - External control of processor states via Linux
+ When you acquire this control, this module uses the Linux cpufreq control
+ to program the processor on your behalf
+ - Host control of processor states
+ This is the normal mode of DVFS control (e.g., Linux cpufreq)
+
+ Additionally, it provides a user-space interface for manipulating
+ p-state regardless of the host's functionality. This includes
+ an ioctl for commanding the implementation and a /proc file for
+ showing current status and capabilities. From user space, you can
+ use the Direct, External, and Host modes.
+
+ What we mean by "p-state" here is the processor's internal
+ configuration. For AMD, this is defined as being the same as
+ the ACPI-defined p-state. For Intel, it is not. There, it is the
+ contents of the perf ctl MSR, which is opaque. We try hard to
+ provide "p-states" that go from 0...max, by analogy or equivalence
+ to the ACPI p-states.
*/
+#define PALACIOS_GOVNAME "v3vee"
+#define MAX_PATH_LEN 128
+#define MAX_GOV_NAME_LEN 16
struct pstate_core_info { // per-CPU p-state bookkeeping; one instance per CPU via DEFINE_PER_CPU(core_state)
 // Current control mode for this core; the code in this file uses
 // V3_PSTATE_HOST_CONTROL, V3_PSTATE_EXTERNAL_CONTROL, V3_PSTATE_DIRECT_CONTROL and V3_PSTATE_INTERNAL_CONTROL
 uint32_t mode;
-
+
 // Apply if we are under the DIRECT state
- uint8_t cur_pstate;
- uint8_t max_pstate;
- uint8_t min_pstate;
+ uint64_t cur_pstate;
+ uint64_t max_pstate; // filled from funcs->get_max_pstate() in init_core()
+ uint64_t min_pstate; // filled from funcs->get_min_pstate() in init_core()
- uint8_t cur_hw_pstate;
+ uint64_t cur_hw_pstate;
 // Apply if we are under the EXTERNAL state
+ uint64_t set_freq_khz; // this is the frequency we're hoping to get
 uint64_t cur_freq_khz; // updated by the cpufreq transition notifier on CPUFREQ_POSTCHANGE
 uint64_t max_freq_khz;
 uint64_t min_freq_khz;
-
- // Intel-specific for DIRECT state
+
+ // Intel-specific
+ uint8_t prior_speedstep; // bit 16 of MSR_MISC_ENABLE_IA32 as found at arch init; written back at arch deinit
 uint8_t turbo_disabled;
 uint8_t no_turbo;
-
+
 int have_cpufreq; // set in init_core(): 1 if cpufreq_cpu_get() returned a policy, else 0
-
+
+ // This is where we stash Linux's governor when we make a mode switch
+ // (buffer allocated in get_current_governor(), released by free_linux_governor())
+ char * linux_governor;
+ // We have this so we can restore the original frequency when we started
+ uint64_t original_hz;
+
};
static DEFINE_PER_CPU(struct pstate_core_info, core_state);
+
// These are used to assert DIRECT control over the core pstates
struct pstate_core_funcs { // per-architecture hooks, reached through machine_state.funcs
 void (*arch_init)(void);
 void (*arch_deinit)(void);
- uint8_t (*get_min_pstate)(void);
- uint8_t (*get_max_pstate)(void);
- uint8_t (*get_pstate)(void);
- void (*set_pstate)(uint8_t pstate);
+ uint64_t (*get_min_pstate)(void); // lowest p-state value accepted by set_pstate
+ uint64_t (*get_max_pstate)(void); // highest p-state value accepted by set_pstate
+ uint64_t (*get_pstate)(void); // current p-state as reported by the architecture code
+ void (*set_pstate)(uint64_t pstate); // request a p-state; the implementations in this file clamp to the maximum
};
struct pstate_machine_info {
enum {INTEL, AMD, OTHER } arch;
int supports_pstates;
+
+
+ // For AMD
+ int have_pstate;
+ int have_coreboost;
+ int have_feedback;
+
+ // For Intel
+ int have_speedstep;
+ int have_opportunistic; // this means "Turbo Boost" or "IDA"
+ int have_policy_hint;
+ int have_hwp; // hardware-controlled performance states
+ int have_hdc; // hardware duty cycling
+ int have_mwait_ext; // mwait power extensions
+ int have_mwait_int; // mwait wakes on interrupt
+
+ // for both
+ int have_pstate_hw_coord; // mperf/aperf
+
// used for DIRECT control
struct pstate_core_funcs *funcs;
+
};
static struct pstate_machine_info machine_state;
/****************************************************
- AMD DIRECT CONTROL
-***************************************************/
+ AMD DIRECT CONTROL
+ ***************************************************/
/* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
#define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
/* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
+/*
+ * Probe AMD p-state support.
+ *
+ * Records HwPstate (bit 7), core boost (bit 9) and feedback (bit 11) from
+ * CPUID Fn8000_0007 EDX and bit 0 of CPUID.06H:ECX in machine_state, logs
+ * the ACPI performance states of the calling CPU (when ACPI provides them)
+ * and warns if the ACPI control value of state i is not i.
+ *
+ * Returns machine_state.have_pstate (nonzero if hardware p-states exist).
+ */
static uint8_t supports_pstates_amd (void)
{
+    int i;
+    int mapwrong=0;
+    int amd_num_pstates=0;
+    struct acpi_processor *pr;
+
     uint32_t eax, ebx, ecx, edx;
+
     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
- return !!(edx & (1 << 7));
+    machine_state.have_pstate = !!(edx & (1 << 7));
+    machine_state.have_coreboost = !!(edx & (1<<9));
+    machine_state.have_feedback = !!(edx & (1<<11));
+
+    cpuid(0x6, &eax, &ebx, &ecx, &edx);
+    machine_state.have_pstate_hw_coord = !!(ecx & 1);
+
+    INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
+         machine_state.have_pstate,
+         machine_state.have_coreboost,
+         machine_state.have_feedback,
+         machine_state.have_pstate_hw_coord);
+
+    /*
+     * Take the per-CPU ACPI processor pointer exactly once and release it
+     * exactly once: every get_cpu_var() disables preemption and must be
+     * paired with a put_cpu_var().  Also tolerate a machine without ACPI
+     * performance data instead of dereferencing NULL.
+     */
+    pr = get_cpu_var(processors);
+    if (pr && pr->performance) {
+        amd_num_pstates = pr->performance->state_count;
+        for (i=0;i<amd_num_pstates;i++) {
+            INFO("P-State: %u: freq=%llu ctrl=%llx%s\n",
+                 i,
+                 pr->performance->states[i].core_frequency*1000,
+                 pr->performance->states[i].control,
+                 pr->performance->states[i].control != i ? (mapwrong=1, " ALERT - CTRL MAPPING NOT 1:1") : "");
+        }
+    }
+    put_cpu_var(processors);
+
+    if (mapwrong) {
+        ERROR("P-State: AMD: mapping of pstate and control is not 1:1 on this processor - we will probably not work corrrectly\n");
+    }
+
+    return machine_state.have_pstate;
+
+
}
+
static void init_arch_amd(void) // per-architecture init hook for AMD; intentionally empty
{
 /* KCH: nothing to do here */
}
+
static void deinit_arch_amd(void) // per-architecture deinit hook for AMD; intentionally empty
{
 /* KCH: nothing to do here */
}
-static uint8_t get_pstate_amd(void)
+
+static uint64_t get_pstate_amd(void) // returns the pstate field of the AMD p-state status register image
{
 struct p_state_stat_reg_amd pstat;
 // NOTE(review): no MSR read into pstat is visible in this view before the field is returned - confirm against the full file
 return pstat.reg.pstate;
}
-static void set_pstate_amd(uint8_t p)
+
+static void set_pstate_amd(uint64_t p) // request p-state p on AMD, clamped to this core's max_pstate
{
 struct p_state_ctl_reg_amd pctl;
+
+ if (p>get_cpu_var(core_state).max_pstate) { // clamp requests above the recorded maximum
+ p=get_cpu_var(core_state).max_pstate;
+ }
+ put_cpu_var(core_state);
+
 pctl.val = 0;
 pctl.reg.cmd = p;
 // NOTE(review): no write of pctl to the control MSR is visible in this view, and the
 // get_cpu_var/put_cpu_var calls do not pair up as shown - confirm against the full file
 put_cpu_var(core_state);
}
+
/*
* NOTE: HW may change this value at runtime
*/
-static uint8_t get_max_pstate_amd(void)
+static uint64_t get_max_pstate_amd(void)
{
struct p_state_limit_reg_amd plimits;
}
-static uint8_t get_min_pstate_amd(void)
+static uint64_t get_min_pstate_amd(void)
{
struct p_state_limit_reg_amd plimits;
/***********************************************************
INTEL DIRECT CONTROL
-**********************************************************/
+ **********************************************************/
+
+/*
+ This implementation uses SpeedStep, but does check
+ to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
+ are available.
+*/
/* Intel System Programmer's Manual Vol. 3B, 14-2 */
#define MSR_MPERF_IA32 0x000000e7
#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
#define MSR_PLATFORM_INFO_IA32 0x000000ce
#define MSR_PERF_CTL_IA32 0x00000199
+#define MSR_PERF_STAT_IA32 0x00000198
+#define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
+
+
+/* Note that the actual meaning of the pstate
+ in the control and status registers is actually
+ implementation dependent, unlike AMD. The "official"
+ way to figure out the mapping from pstate to
+ these values is via ACPI. What is written in the register
+ is an "id" of an operation point
+
+ "Often", the 16 bit field consists of a high order byte
+ which is the frequency (the multiplier) and the low order
+ byte is the voltage.
+ */
+// MSR_PERF_CTL_IA32 r/w
+struct perf_ctl_reg_intel {
+ union {
+ uint64_t val;
+ struct {
+ // This is the target
+ // Note, not the ACPI pstate, but
+ // Intel's notion of pstate is that it's opaque
+ // for lots of implementations it seems to be
+ // frequency_id : voltage_id
+ // where frequency_id is typically the multiplier
+ uint16_t pstate : 16;
+ uint16_t reserved : 16;
+ // set to 1 to *disengage* dynamic acceleration
+ // Note that "IDA" and "Turbo" use the same interface
+ uint16_t dynamic_accel_disable : 1;
+ uint32_t reserved2 : 31;
+ } reg;
+ } __attribute__((packed));
+} __attribute__((packed));
+// MSR_PERF_STAT_IA32 r
+struct perf_stat_reg_intel {
+ union {
+ uint64_t val;
+ struct {
+ // this is the current
+ uint16_t pstate : 16;
+ uint64_t reserved : 48;
+ } reg;
+ } __attribute__((packed));
+} __attribute__((packed));
+// MSR_ENERGY_PERF_BIAS_IA32 r/w
+struct enery_perf_bias_reg_intel { // 64-bit image of the energy/performance bias MSR (see MSR_ENERY_PERF_BIAS_IA32)
+ union {
+ uint64_t val; // whole register, for rdmsrl/wrmsrl-style access
+ struct {
+ // 4-bit policy hint held in the low bits of the register
+ uint8_t policy_hint : 4;
+ uint64_t reserved : 60;
+ } reg;
+ } __attribute__((packed));
+} __attribute__((packed));
+// MSR_PLATFORM_INFO
struct turbo_mode_info_reg_intel {
union {
uint64_t val;
struct {
- uint8_t rsvd0;
- uint8_t max_noturbo_ratio;
- uint16_t rsvd1 : 12;
- uint8_t ratio_limit : 1;
+ uint8_t rsvd0 : 8;
+ uint8_t max_noturbo_ratio : 8;
+ uint8_t rsvd1 : 7;
+ uint8_t ppin_cap : 1;
+ uint8_t rsvd2 : 4;
+ uint8_t ratio_limit : 1;
uint8_t tdc_tdp_limit : 1;
- uint16_t rsvd2 : 10;
- uint8_t min_ratio;
- uint16_t rsvd3;
+ uint16_t rsvd3 : 10;
+ uint8_t min_ratio : 8;
+ uint16_t rsvd4 : 16;
} reg;
} __attribute__((packed));
} __attribute__((packed));
-
+
+// This replicates the critical information in Linux's struct acpi_processor_px
+// To make it easier to port to other OSes.
+struct intel_pstate_info {
+ uint64_t freq; // KHz
+ uint64_t ctrl; // What to write into the _CTL MSR to get this
+};
+
+// The internal array will be used if we cannot build the table locally
+static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0;
+static int intel_num_pstates_internal=0;
+
+// These will either point to the internal array or to a constructed array
+static struct intel_pstate_info *intel_pstate_to_ctrl=0;
+static int intel_num_pstates=0;
+
/* CPUID.01:ECX.EIST(7) - Enhanced Intel SpeedStep Technology */
+/*
+ * Probe Intel DVFS features and build the p-state -> control value table.
+ *
+ * Fills the have_* flags of machine_state from CPUID leaves 1, 6 and 5.
+ * When SpeedStep is present, intel_pstate_to_ctrl / intel_num_pstates are
+ * set from the calling CPU's ACPI performance states, or from the internal
+ * table when ACPI provides none.
+ *
+ * Returns machine_state.have_speedstep.
+ */
static uint8_t supports_pstates_intel(void)
{
 /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
- */
+ */
     uint32_t eax, ebx, ecx, edx;
+
     cpuid(0x1, &eax, &ebx, &ecx, &edx);
- return !!(ecx & (1 << 7));
+    machine_state.have_speedstep = !!(ecx & (1 << 7));
+
+    cpuid(0x6, &eax, &ebx, &ecx, &edx);
+    machine_state.have_pstate_hw_coord = !!(ecx & 1); // ?
+    machine_state.have_opportunistic = !!(eax & 1<<1);
+    machine_state.have_policy_hint = !!(ecx & 1<<3);
+    machine_state.have_hwp = !!(eax & 1<<7);
+    machine_state.have_hdc = !!(eax & 1<<13);
+
+    cpuid(0x5, &eax, &ebx, &ecx, &edx);
+    machine_state.have_mwait_ext = !!(ecx & 1);
+    machine_state.have_mwait_int = !!(ecx & 1<<1);
+
+
+    // Note we test all the available hardware features documented as of August 2014
+    // We are only currently using speed_step, however.
+
+    INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
+         machine_state.have_speedstep,
+         machine_state.have_pstate_hw_coord,
+         machine_state.have_opportunistic,
+         machine_state.have_policy_hint,
+         machine_state.have_hwp,
+         machine_state.have_hdc,
+         machine_state.have_mwait_ext,
+         machine_state.have_mwait_int );
+
+
+    if (machine_state.have_speedstep) {
+        uint32_t i;
+        struct acpi_processor *pr;
+
+        // Build mapping table (from "pstate" (0..) to ctrl value for MSR
+        //
+        // The per-CPU ACPI pointer is fetched once and released once so
+        // that the preemption count stays balanced on every path.
+        pr = get_cpu_var(processors);
+        if (!pr || !pr->performance) {
+            // no acpi... revert to internal table
+            intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
+            intel_num_pstates=intel_num_pstates_internal;
+        } else {
+            intel_num_pstates = pr->performance->state_count;
+            if (intel_num_pstates) {
+                intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
+                if (!intel_pstate_to_ctrl) {
+                    ERROR("P-State: Cannot allocate space for mapping...\n");
+                    // zero states => the copy loop below does not run
+                    intel_num_pstates=0;
+                }
+                for (i=0;i<intel_num_pstates;i++) {
+                    intel_pstate_to_ctrl[i].freq = pr->performance->states[i].core_frequency*1000;
+                    intel_pstate_to_ctrl[i].ctrl = pr->performance->states[i].control;
+                }
+
+            } else {
+                ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
+            }
+        }
+        put_cpu_var(processors);
+
+        INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
+        for (i=0;i<intel_num_pstates;i++) {
+            INFO("P-State: Intel Mapping %u: freq=%llu ctrl=%llx\n",
+                 i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
+        }
+    } else {
+        INFO("P-State: Intel: No speedstep here\n");
+    }
+
+
+    return machine_state.have_speedstep;
}
rdmsrl(MSR_MISC_ENABLE_IA32, val);
- val |= 1 << 16;
+ //INFO("P-State: prior ENABLE=%llx\n",val);
+ // store prior speedstep setting
+ get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
+ put_cpu_var(core_state);
+
+ // enable speedstep (probably already on)
+ val |= 1 << 16;
wrmsrl(MSR_MISC_ENABLE_IA32, val);
+ //INFO("P-State: write ENABLE=%llx\n",val);
+
}
static void deinit_arch_intel(void)
{
- // ??
+    uint64_t misc_enable;
+    uint64_t saved_bit;
+
+    /* Put bit 16 of MSR_MISC_ENABLE_IA32 back to the value recorded in
+     * prior_speedstep, leaving every other bit of the register untouched. */
+    rdmsrl(MSR_MISC_ENABLE_IA32, misc_enable);
+
+    saved_bit = get_cpu_var(core_state).prior_speedstep;
+    put_cpu_var(core_state);
+
+    misc_enable = (misc_enable & ~(1ULL << 16)) | (saved_bit << 16);
+
+    wrmsrl(MSR_MISC_ENABLE_IA32, misc_enable);
}
/* TODO: Intel P-states require sampling at intervals... */
-static uint8_t get_pstate_intel(void)
+static uint64_t get_pstate_intel(void) // returns the raw 64-bit MSR_PERF_STAT_IA32 contents, not a 0..max table index
{
- uint8_t pstate;
+ uint64_t val;
- // This should read the HW...
- pstate=get_cpu_var(core_state).cur_pstate;
- put_cpu_var(core_state);
- return pstate;
+ rdmsrl(MSR_PERF_STAT_IA32,val);
+
+ //INFO("P-State: Get: 0x%llx\n", val);
+
+ // should check if turbo is active, in which case
+ // this value is not the whole story
+
+ return val; // unmasked: reserved upper bits of the status register are returned as read
}
-
-static void set_pstate_intel(uint8_t p)
+
+static void set_pstate_intel(uint64_t p) // program table entry p (clamped to the last entry) into MSR_PERF_CTL_IA32
{
- uint64_t val = ((uint64_t)p) << 8;
+ uint64_t val;
+ uint64_t ctrl;
- /* ...Intel IDA (dynamic acceleration)
- if (c->no_turbo && !c->turbo_disabled) {
- val |= 1 << 32;
+ if (intel_num_pstates==0) {
+ return ;
+ } else {
+ if (p>=intel_num_pstates) {
+ p=intel_num_pstates-1;
+ }
 }
- */
+
+ ctrl=intel_pstate_to_ctrl[p].ctrl;
+
+ /* ...Intel IDA (dynamic acceleration)
+ if (c->no_turbo && !c->turbo_disabled) {
+ val |= 1 << 32;
+ }
+ */
+ // leave all bits alone except for the low 16 bits
+ // (the likely fid/vid bits), which are replaced by the table's ctrl value
+
+ rdmsrl(MSR_PERF_CTL_IA32, val);
+ //INFO("P-State: Pre-Set: 0x%llx\n", val);
+
+ val &= ~0xffffULL;
+ val |= ctrl & 0xffffULL;
+
+ //INFO("P-State: Set: 0x%llx\n", val);
 wrmsrl(MSR_PERF_CTL_IA32, val);
}
-static uint8_t get_min_pstate_intel(void)
+static uint64_t get_min_pstate_intel(void) // lowest valid index into the Intel p-state table
{
- struct turbo_mode_info_reg_intel t;
-
- rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
-
- return t.reg.min_ratio;
+ return 0; // p-states are table indices here, so the minimum is always index 0
}
-static uint8_t get_max_pstate_intel (void)
+static uint64_t get_max_pstate_intel (void)
{
- struct turbo_mode_info_reg_intel t;
-
- rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
-
- return t.reg.max_noturbo_ratio;
+    /* Highest valid table index; 0 when the table is empty. */
+    return (intel_num_pstates==0) ? 0 : intel_num_pstates-1;
}
static struct pstate_core_funcs intel_funcs =
/***********************************************
Arch determination and setup
-***********************************************/
-
+ ***********************************************/
+
static inline void cpuid_string (uint32_t id, uint32_t dest[4]) // run CPUID leaf `id`; dest[0..3] receive EAX, EBX, ECX, EDX in that order
{
 asm volatile("cpuid"
- :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
- :"a"(id));
+ :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
+ :"a"(id));
}
-
+
static int get_cpu_vendor (char name[13]) // writes the 12-character vendor string plus a terminator into name; returns EAX of CPUID leaf 0
{
 uint32_t dest[4];
 uint32_t maxid;
-
+
 cpuid_string(0,dest);
 maxid=dest[0];
 // the vendor string is assembled in the order EBX, EDX, ECX
 ((uint32_t*)name)[0]=dest[1];
 ((uint32_t*)name)[1]=dest[3];
 ((uint32_t*)name)[2]=dest[2];
 name[12]=0;
-
+
 return maxid;
}
static int is_intel (void)
{
- char name[13];
- get_cpu_vendor(name);
- return !strcmp(name,"GenuineIntel");
+    /* Nonzero iff the CPUID vendor string is "GenuineIntel". */
+    char vendor[13];
+
+    get_cpu_vendor(vendor);
+    return strcmp(vendor,"GenuineIntel") == 0;
}
static int is_amd (void)
{
- char name[13];
- get_cpu_vendor(name);
- return !strcmp(name,"AuthenticAMD");
+    /* Nonzero iff the CPUID vendor string is "AuthenticAMD". */
+    char vendor[13];
+
+    get_cpu_vendor(vendor);
+    return strcmp(vendor,"AuthenticAMD") == 0;
}
static int pstate_arch_setup(void) // detect the CPU vendor and fill machine_state (arch, funcs, supports_pstates); always returns 0
{
-
+
 if (is_amd()) {
 machine_state.arch = AMD;
 machine_state.funcs = &amd_funcs;
- machine_state.supports_pstates = supports_pstates_amd();
- INFO("PSTATE: P-State initialized for AMD\n");
+ machine_state.supports_pstates = supports_pstates_amd();
+ INFO("PSTATE: P-State initialized for AMD\n");
 } else if (is_intel()) {
 machine_state.arch = INTEL;
 machine_state.funcs = &intel_funcs;
- machine_state.supports_pstates = supports_pstates_intel();
+ machine_state.supports_pstates = supports_pstates_intel();
 INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
 return 0;
-
+
 } else {
- machine_state.arch = OTHER;
- machine_state.funcs = NULL;
- machine_state.supports_pstates = 0;
+ machine_state.arch = OTHER;
+ machine_state.funcs = NULL; // no direct-control hooks on unknown vendors; users check funcs before calling
+ machine_state.supports_pstates = 0;
 INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
 return 0;
 }
-
+
 return 0;
}
/******************************************************************
Linux Interface
-*****************************************************************/
+ *****************************************************************/
+
+static unsigned cpus_using_v3_governor;
+static DEFINE_MUTEX(v3_governor_mutex);
-#if 0
-// The purpose of the stub governor is the pretend to keep
-// the processor at the maximum frequency, while we manipulate he
-// processor ccre directly
+/* KCH: this will tell us when there is an actual frequency transition */
+static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+                               void *data)
+{
+    struct cpufreq_freqs *transition = data;
+    struct pstate_core_info *core = &per_cpu(core_state, transition->cpu);
+
+    /* Only cores under external control are tracked, and only once the
+     * transition has actually completed. */
+    if (core->mode == V3_PSTATE_EXTERNAL_CONTROL && val == CPUFREQ_POSTCHANGE) {
+        DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
+              transition->cpu, transition->new);
+        core->cur_freq_khz = transition->new;
+    }
+
+    /* the notifier never vetoes or reports an error */
+    return 0;
+}
+
+
+static struct notifier_block v3_cpufreq_notifier_block = {
+ .notifier_call = v3_cpufreq_notifier
+};
+
+
+/*
+ * This stub governor is simply a placeholder for preventing
+ * frequency changes from the Linux side. For now, we simply leave
+ * the frequency as is when we acquire control.
+ */
static int governor_run(struct cpufreq_policy *policy, unsigned int event) // cpufreq governor callback for the stub governor; returns 0, or -1 for an unknown event
{
- switch (event) {
- case CPUFREQ_GOV_START:
- case CPUFREQ_GOV_STOP:
- cpu_freq_driver_target(policy, policy->max_freq);
+ unsigned cpu = policy->cpu;
- case CPUFREQ_GOV_LIMITS:
+ switch (event) {
+ /* we can't use cpufreq_driver_target here as it can result
+ * in a circular dependency, so we'll keep the current frequency as is
+ */
+ case CPUFREQ_GOV_START:
+ BUG_ON(!policy->cur);
+
+ mutex_lock(&v3_governor_mutex); // protects cpus_using_v3_governor and notifier (un)registration
+
+ if (cpus_using_v3_governor == 0) { // first CPU to start the governor registers the transition notifier
+ cpufreq_register_notifier(&v3_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
+ cpus_using_v3_governor++;
+
+ per_cpu(core_state, cpu).set_freq_khz = policy->cur; // seed the per-CPU frequency bookkeeping from the policy
+ per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
+ per_cpu(core_state, cpu).max_freq_khz = policy->max;
+ per_cpu(core_state, cpu).min_freq_khz = policy->min;
+
+ mutex_unlock(&v3_governor_mutex);
+ break;
+ case CPUFREQ_GOV_STOP:
+ mutex_lock(&v3_governor_mutex);
+
+ cpus_using_v3_governor--;
+
+ if (cpus_using_v3_governor == 0) { // last CPU to stop the governor unregisters the notifier
+ cpufreq_unregister_notifier(
+ &v3_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
+ per_cpu(core_state, cpu).set_freq_khz = 0; // clear the bookkeeping for this CPU
+ per_cpu(core_state, cpu).cur_freq_khz = 0;
+ per_cpu(core_state, cpu).max_freq_khz = 0;
+ per_cpu(core_state, cpu).min_freq_khz = 0;
+
+ mutex_unlock(&v3_governor_mutex);
+ break;
+ case CPUFREQ_GOV_LIMITS:
+ /* do nothing */
+ break;
+ default:
+ ERROR("Undefined governor command (%u)\n", event);
+ return -1;
 }
+
+ return 0;
}
+
static struct cpufreq_governor stub_governor =
{
- .name="PALACIOS_STUB",
- .governor=governor_run,
- .owner=.THIS_MODULE,
+ .name = PALACIOS_GOVNAME,
+ .governor = governor_run,
+ .owner = THIS_MODULE,
+};
+
+
+static struct workqueue_struct *pstate_wq;
+
+typedef struct {
+ struct work_struct work;
+ uint64_t freq;
+} pstate_work_t;
+
+
+
+/*
+ * Register the stub governor with the Linux cpufreq core.
+ * The interface is void, so a registration failure can only be logged;
+ * it used to be dropped silently.
+ */
+static inline void pstate_register_linux_governor(void)
+{
+    if (cpufreq_register_governor(&stub_governor)) {
+        ERROR("Could not register the %s cpufreq governor\n", PALACIOS_GOVNAME);
+    }
+}
+
+
+static inline void pstate_unregister_linux_governor(void) /* remove the stub governor from the
+   Linux cpufreq core */
+{
+ cpufreq_unregister_governor(&stub_governor);
}
-static void linux_init(void)
+
+static int pstate_linux_init(void)
{
- // get_policy
- //
- // change to userspace governor - or change to our do nothing governor? (call set_speed)
- // stash the old governor
- // tell governor to do max freq
+    /* Register the stub governor, then create the work queue used for
+     * frequency-change requests.  Returns 0 on success, -1 on failure. */
+    pstate_register_linux_governor();
+
+    pstate_wq = create_workqueue("v3vee_pstate_wq");
+    if (!pstate_wq) {
+        ERROR("Could not create work queue\n");
+        /* undo the registration before reporting failure */
+        pstate_unregister_linux_governor();
+        return -1;
+    }
+
     return 0;
}
-static void linux_deinit(void)
+
+static void pstate_linux_deinit(void) /* undo pstate_linux_init(): drop the stub governor and the work queue */
{
+ pstate_unregister_linux_governor();
+ flush_workqueue(pstate_wq); // let queued frequency-change work finish before the queue is destroyed
+ destroy_workqueue(pstate_wq);
}
-static uint8_t linux_get_pstate(void)
+
+/*
+ * Copy the name of the cpufreq governor currently set on `cpu` into a newly
+ * allocated buffer of MAX_GOV_NAME_LEN bytes.
+ *
+ * On success the buffer is stored both in *buf and in the calling CPU's
+ * core_state.linux_governor, and 0 is returned.  The buffer is later
+ * released through free_linux_governor().  Returns -1 on any failure.
+ */
+static int get_current_governor(char **buf, unsigned int cpu)
{
+    struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
+    char * govname = NULL;
+
+    if (!policy) {
+        ERROR("could not allocate cpufreq_policy\n");
+        return -1;
+    }
+
+    if (cpufreq_get_policy(policy, cpu) != 0) {
+        ERROR("Could not get current cpufreq policy\n");
+        goto out_err;
+    }
+
+    /* A policy is not guaranteed to carry a governor (drivers that set the
+     * policy themselves leave it NULL), so do not dereference it blindly. */
+    if (!policy->governor) {
+        ERROR("Current cpufreq policy has no governor\n");
+        goto out_err;
+    }
+
+    /* We're in interrupt context, should probably not wait here */
+    govname = palacios_alloc(MAX_GOV_NAME_LEN);
+    if (!govname) {
+        ERROR("Could not allocate space for governor name\n");
+        goto out_err;
+    }
+
+    /* strncpy does not terminate on truncation, hence the explicit 0 */
+    strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
+    govname[MAX_GOV_NAME_LEN-1] = 0;
+
+    get_cpu_var(core_state).linux_governor = govname;
+    put_cpu_var(core_state);
+
+    *buf = govname;
+
+    palacios_free(policy);
+
     return 0;
+
+out_err:
+    palacios_free(policy);
+    return -1;
}
-static void linux_set_pstate(uint8_t p)
+
+/* passed to the userspacehelper interface for cleanup */
+static void gov_switch_cleanup(struct subprocess_info * s)
{
+    /* argv[2] is the command string built by governor_switch(); it has to
+     * be released before the argv array that holds the pointer to it. */
+    char ** helper_argv = s->argv;
+    char * command = helper_argv[2];
+
+    palacios_free(command);
+    palacios_free(helper_argv);
}
-static void linux_restore_defaults(void)
+
+/*
+ * Switch governors
+ * @s - the governor to switch to
+ * TODO: this should probably be submitted to a work queue
+ * so we don't have to run it in interrupt context
+ */
+/*
+ * Ask user space to write governor name `s` into
+ * /sys/devices/system/cpu/cpu<cpu>/cpufreq/scaling_governor by spawning
+ * "/bin/sh -c ..." through the usermode helper, without waiting for it.
+ *
+ * argv and the command string are heap allocated; once the helper has been
+ * set up they are released by gov_switch_cleanup().
+ *
+ * Returns the usermode-helper result, or -1 if setup fails here.
+ */
+static int governor_switch(char * s, unsigned int cpu)
{
-}
+    char * path_str = NULL;
+    char ** argv = NULL;
+
+    static char * envp[] = {
+        "HOME=/",
+        "TERM=linux",
+        "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
+
+    /* never format a missing name into the shell command */
+    if (!s) {
+        ERROR("No governor name given\n");
+        return -1;
+    }
+
+    argv = palacios_alloc(4*sizeof(char*));
+    if (!argv) {
+        ERROR("Couldn't allocate argv struct\n");
+        return -1;
+    }
+    path_str = palacios_alloc(MAX_PATH_LEN);
+    if (!path_str) {
+        ERROR("Couldn't allocate path string\n");
+        goto out_freeargv;
+    }
+    memset(path_str, 0, MAX_PATH_LEN);
+
+    snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
+
+    argv[0] = "/bin/sh";
+    argv[1] = "-c";
+    argv[2] = path_str;
+    argv[3] = NULL;
+
+    /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(3,9,0)
+    return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
+#else
+    {
+        struct subprocess_info *sp;
+
+        sp = call_usermodehelper_setup("/bin/sh", argv, envp, GFP_ATOMIC, NULL, gov_switch_cleanup, NULL);
+        if (!sp) {
+            /* the cleanup callback will never run, so release the
+             * command string here as well as argv */
+            palacios_free(path_str);
+            goto out_freeargv;
+        }
+
+        return call_usermodehelper_exec(sp,0);
+    }
#endif
+
+out_freeargv:
+    palacios_free(argv);
+    return -1;
+}
+
+
+/*
+ * Release the governor name saved for the calling CPU and clear the slot,
+ * so that a later call (or a later restore) cannot free or reuse a stale
+ * pointer.
+ */
+static inline void free_linux_governor(void)
+{
+    struct pstate_core_info *core = &get_cpu_var(core_state);
+
+    if (core->linux_governor) {
+        palacios_free(core->linux_governor);
+        core->linux_governor = NULL;
+    }
+
+    put_cpu_var(core_state);
+}
+
+
+static int linux_setup_palacios_governor(void)
+{
+    char * saved_gov;
+    unsigned int this_cpu;
+
+    this_cpu = get_cpu();
+    put_cpu();
+
+    /* KCH: we assume the v3vee governor is already
+     * registered with kernel by this point
+     */
+
+    /* Step 1: remember which governor Linux is using right now */
+    if (get_current_governor(&saved_gov, this_cpu) < 0) {
+        ERROR("Could not get current governor\n");
+        return -1;
+    }
+
+    DEBUG("saving current governor (%s)\n", saved_gov);
+
+    get_cpu_var(core_state).linux_governor = saved_gov;
+    put_cpu_var(core_state);
+
+    /* Step 2: hand the core over to our own governor */
+    DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
+
+    if (governor_switch(PALACIOS_GOVNAME, this_cpu) >= 0) {
+        return 0;
+    }
+
+    ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
+    return -1;
+}
+
+
+
+/*
+ * Return the position of the current cpufreq frequency among the valid
+ * entries of the CPU's frequency table (0 = first valid entry).
+ *
+ * Returns (uint64_t)-1 if the policy cannot be allocated or read, or if
+ * the CPU has no frequency table.
+ */
+static uint64_t linux_get_pstate(void)
+{
+    struct cpufreq_policy * policy = NULL;
+    struct cpufreq_frequency_table *table;
+    unsigned int i = 0;
+    unsigned int count = 0;
+    unsigned int cpu = get_cpu();
+    put_cpu();
+
+
+    policy = palacios_alloc(sizeof(struct cpufreq_policy));
+    if (!policy) {
+        ERROR("Could not allocate policy struct\n");
+        return -1;
+    }
+
+    if (cpufreq_get_policy(policy, cpu)) {
+        ERROR("Could not get current policy\n");
+        palacios_free(policy);
+        return -1;
+    }
+
+    table = cpufreq_frequency_get_table(cpu);
+    if (!table) {
+        ERROR("No cpufreq frequency table for cpu %u\n", cpu);
+        palacios_free(policy);
+        return -1;
+    }
+
+    for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
+
+        /* invalid slots do not count as p-states */
+        if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
+            continue;
+        }
+
+        if (table[i].frequency == policy->cur) {
+            break;
+        }
+
+        count++;
+    }
+
+    palacios_free(policy);
+
+    /* the get_cpu() above is already paired with a put_cpu();
+     * a second put_cpu() here would unbalance the preempt count */
+    return count;
+}
+
+
+/*
+ * Return the current frequency (policy->cur) that cpufreq reports for the
+ * calling CPU, or (uint64_t)-1 if the policy cannot be allocated or read.
+ */
+static uint64_t linux_get_freq(void)
+{
+    uint64_t freq;
+    struct cpufreq_policy * policy = NULL;
+    unsigned int cpu = get_cpu();
+    put_cpu();
+
+    policy = palacios_alloc(sizeof(struct cpufreq_policy));
+    if (!policy) {
+        ERROR("Could not allocate policy struct\n");
+        return -1;
+    }
+
+    if (cpufreq_get_policy(policy, cpu)) {
+        ERROR("Could not get current policy\n");
+        /* do not leak the scratch policy on the error path */
+        palacios_free(policy);
+        return -1;
+    }
+
+    freq=policy->cur;
+
+    palacios_free(policy);
+
+    return freq;
+}
+
+/*
+ * Work-queue handler that performs a frequency change requested through
+ * linux_set_pstate() / linux_set_freq().
+ *
+ * `work` is the first member of a heap-allocated pstate_work_t; the handler
+ * owns that allocation and frees it on every path.  The requested frequency
+ * is recorded in set_freq_khz, clamped to [min_freq_khz, max_freq_khz] of
+ * the CPU the handler runs on, and passed to __cpufreq_driver_target().
+ */
+static void
+pstate_switch_workfn (struct work_struct *work)
+{
+    pstate_work_t * pwork = (pstate_work_t*)work;
+    struct cpufreq_policy * policy = NULL;
+    struct pstate_core_info * core;
+    uint64_t freq;
+    unsigned int cpu = get_cpu();
+    put_cpu();
+
+    mutex_lock(&v3_governor_mutex);
+
+    policy = palacios_alloc(sizeof(struct cpufreq_policy));
+    if (!policy) {
+        ERROR("Could not allocate space for cpufreq policy\n");
+        goto out;
+    }
+
+    if (cpufreq_get_policy(policy, cpu) != 0) {
+        ERROR("Could not get cpufreq policy\n");
+        goto out1;
+    }
+
+    freq = pwork->freq;
+
+    /* one get / one put: each get_cpu_var() disables preemption, so the
+     * per-CPU state is reached through a single pointer */
+    core = &get_cpu_var(core_state);
+    core->set_freq_khz = freq;
+
+    if (freq < core->min_freq_khz) {
+        freq = core->min_freq_khz;
+    }
+    if (freq > core->max_freq_khz) {
+        freq = core->max_freq_khz;
+    }
+    put_cpu_var(core_state);
+
+    INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
+    __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+out1:
+    palacios_free(policy);
+out:
+    palacios_free(work);
+    mutex_unlock(&v3_governor_mutex);
+}
+
+
+/*
+ * Request p-state `p` under external (cpufreq) control.
+ *
+ * `p` indexes the valid entries of the CPU's cpufreq frequency table; a
+ * value past the end selects the last valid entry.  The change itself is
+ * performed asynchronously by pstate_switch_workfn(), which takes ownership
+ * of the queued work item.
+ *
+ * Returns 0 once the request is queued, -1 on allocation failure, if the
+ * policy cannot be read, or if the table is missing or has no valid entry.
+ */
+static int linux_set_pstate(uint64_t p)
+{
+    struct cpufreq_policy * policy = NULL;
+    struct cpufreq_frequency_table *table;
+    pstate_work_t * work = NULL;
+    unsigned int i = 0;
+    unsigned int count = 0;
+    uint64_t target = 0;
+    int have_target = 0;
+    unsigned int cpu = get_cpu();
+    put_cpu();
+
+    policy = palacios_alloc(sizeof(struct cpufreq_policy));
+    if (!policy) {
+        ERROR("Could not allocate policy struct\n");
+        return -1;
+    }
+
+    work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
+    if (!work) {
+        ERROR("Could not allocate work struct\n");
+        goto out_err;
+    }
+
+    if (cpufreq_get_policy(policy, cpu)) {
+        ERROR("Could not get current policy\n");
+        goto out_err1;
+    }
+
+    table = cpufreq_frequency_get_table(cpu);
+    if (!table) {
+        ERROR("No cpufreq frequency table for cpu %u\n", cpu);
+        goto out_err1;
+    }
+
+    for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
+
+        if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
+            continue;
+        }
+
+        /* remember the most recent valid entry: it is the answer when
+         * count reaches p, and the fallback when p is past the end */
+        target = table[i].frequency;
+        have_target = 1;
+
+        if (count == p) {
+            break;
+        }
+
+        count++;
+    }
+
+    if (!have_target) {
+        ERROR("cpufreq frequency table for cpu %u has no valid entries\n", cpu);
+        goto out_err1;
+    }
+
+    INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
+    work->freq = target;
+    queue_work(pstate_wq, (struct work_struct*)work);
+
+    palacios_free(policy);
+    return 0;
+
+out_err1:
+    palacios_free(work);
+out_err:
+    palacios_free(policy);
+    return -1;
+}
+
+
+static int linux_set_freq(uint64_t f)
+{
+    int rc = -1;
+    struct cpufreq_policy * pol = NULL;
+    pstate_work_t * request = NULL;
+    uint64_t target = f;
+    unsigned int this_cpu = get_cpu();
+    put_cpu();
+
+    pol = palacios_alloc(sizeof(struct cpufreq_policy));
+    if (!pol) {
+        ERROR("Could not allocate policy struct\n");
+        return -1;
+    }
+
+    request = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
+    if (!request) {
+        ERROR("Could not allocate work struct\n");
+        goto done;
+    }
+
+    if (cpufreq_get_policy(pol, this_cpu) != 0) {
+        ERROR("Could not get cpufreq policy\n");
+        palacios_free(request);
+        goto done;
+    }
+
+    /* clamp the request into the policy's [min, max] window */
+    if (target < pol->min) {
+        target = pol->min;
+    } else if (target > pol->max) {
+        target = pol->max;
+    }
+
+    /* the work function frees the request once it has run */
+    INIT_WORK((struct work_struct*)request, pstate_switch_workfn);
+    request->freq = target;
+    queue_work(pstate_wq, (struct work_struct*)request);
+
+    rc = 0;
+
+done:
+    palacios_free(pol);
+    return rc;
+}
+
+
+/*
+ * Switch the calling CPU back to the governor saved by
+ * linux_setup_palacios_governor() and release the saved name.
+ *
+ * Returns 0 on success, -1 if no governor was saved or the switch request
+ * fails.  The saved name is released in both of the latter two cases.
+ */
+static int linux_restore_defaults(void)
+{
+    char * gov = NULL;
+    int rc = 0;
+    unsigned int cpu = get_cpu();
+    put_cpu();
+
+    gov = get_cpu_var(core_state).linux_governor;
+    put_cpu_var(core_state);
+
+    /* nothing was saved (or it was already restored): there is no name to
+     * hand to governor_switch() and nothing to free */
+    if (!gov) {
+        ERROR("No saved governor to restore\n");
+        return -1;
+    }
+
+    DEBUG("restoring previous governor (%s)\n", gov);
+
+    if (governor_switch(gov, cpu) < 0) {
+        ERROR("Could not restore governor to (%s)\n", gov);
+        rc = -1;
+    }
+
+    /* governor_switch() copies the name into its own command string, so
+     * the saved copy can be released whether or not the switch worked */
+    free_linux_governor();
+
+    /* make sure the slot does not keep pointing at freed memory */
+    get_cpu_var(core_state).linux_governor = NULL;
+    put_cpu_var(core_state);
+
+    return rc;
+}
+
/******************************************************************
Generic Interface as provided to Palacios and to the rest of the
module
-******************************************************************/
+ ******************************************************************/
+/*
+ * Initialise the calling CPU's core_state: host control mode, the p-state
+ * range reported by the architecture hooks (0..0 when there are none), and
+ * the cpufreq frequency range when a cpufreq policy exists for the CPU.
+ *
+ * NOTE(review): `cpu` is not declared in the lines visible here; it is
+ * presumably set from get_cpu() in context missing from this view - confirm.
+ */
static void init_core(void)
{
     struct cpufreq_policy *p;
- DEBUG("P-State Core Init\n");
+    //DEBUG("P-State Core Init\n");
     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
     get_cpu_var(core_state).cur_pstate = 0;
-
+
     if (machine_state.funcs) {
- get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
- get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
+        get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
+        get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
     } else {
- get_cpu_var(core_state).min_pstate = 0;
- get_cpu_var(core_state).max_pstate = 0;
+        get_cpu_var(core_state).min_pstate = 0;
+        get_cpu_var(core_state).max_pstate = 0;
     }
     p = cpufreq_cpu_get(cpu);
     if (!p) {
- get_cpu_var(core_state).have_cpufreq = 0;
- get_cpu_var(core_state).min_freq_khz=0;
- get_cpu_var(core_state).max_freq_khz=0;
- get_cpu_var(core_state).cur_freq_khz=0;
+        get_cpu_var(core_state).have_cpufreq = 0;
+        get_cpu_var(core_state).min_freq_khz=0;
+        get_cpu_var(core_state).max_freq_khz=0;
+        get_cpu_var(core_state).cur_freq_khz=0;
     } else {
- get_cpu_var(core_state).have_cpufreq = 1;
- get_cpu_var(core_state).min_freq_khz=p->min;
- get_cpu_var(core_state).max_freq_khz=p->max;
- get_cpu_var(core_state).cur_freq_khz=p->cur;
- cpufreq_cpu_put(p);
- }
-
+        get_cpu_var(core_state).have_cpufreq = 1;
+        get_cpu_var(core_state).min_freq_khz=p->min;
+        get_cpu_var(core_state).max_freq_khz=p->max;
+        get_cpu_var(core_state).cur_freq_khz=p->cur;
+        /* drop the reference taken by cpufreq_cpu_get(); this must stay
+         * inside the branch where p is non-NULL */
+        cpufreq_cpu_put(p);
+    }
     put_cpu_var(core_state);
-
+
+    /*
+    for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) {
+        INFO("P-State: %u: freq=%llu ctrl=%llx",
+             i,
+             get_cpu_var(processors)->performance->states[i].core_frequency*1000,
+             get_cpu_var(processors)->performance->states[i].control);
+    }
+    put_cpu_var(processors);
+    */
}
{
DEBUG("P-State Core Deinit\n");
palacios_pstate_ctrl_release();
+
}
void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c)
{
memset(c,0,sizeof(struct v3_cpu_pstate_chars));
-
+ /*
+  * Report the calling core's DVFS characteristics in *c (zeroed above).
+  * Internal control is always advertised; external and direct control are
+  * added by the two checks below.
+  * NOTE(review): the lines shown call get_cpu_var() three times but
+  * put_cpu_var() once - confirm the pairing against the full function.
+  */
c->features = V3_PSTATE_INTERNAL_CONTROL;
if (get_cpu_var(core_state).have_cpufreq) {
- c->features |= V3_PSTATE_EXTERNAL_CONTROL;
+ c->features |= V3_PSTATE_EXTERNAL_CONTROL; // offered only when have_cpufreq is set for this core
}
if (machine_state.arch==AMD || machine_state.arch==INTEL) {
- c->features |= V3_PSTATE_DIRECT_CONTROL;
+ c->features |= V3_PSTATE_DIRECT_CONTROL; // offered only on AMD and Intel
}
c->cur_mode = get_cpu_var(core_state).mode;
c->min_pstate = get_cpu_var(core_state).min_pstate;
put_cpu_var(core_state);
-
-
+
+
}
-uint8_t palacios_pstate_ctrl_get_pstate(void)
+/*
+ * Return the calling core's current p-state.
+ *
+ * Direct control asks the architecture backend (machine_state.funcs),
+ * external control asks linux_get_pstate(), and any other mode yields 0.
+ */
+uint64_t palacios_pstate_ctrl_get_pstate(void)
{
- if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
- put_cpu_var(core_state);
- return machine_state.funcs->get_pstate();
+ // Read the mode once so that the single get_cpu_var() is matched by a
+ // single put_cpu_var(); testing it again in an else-if would take a
+ // second, never released, preempt-disable.
+ // uint32_t matches the mode type taken by palacios_pstate_ctrl_acquire().
+ uint32_t mode = get_cpu_var(core_state).mode;
+ put_cpu_var(core_state);
+
+ if (mode==V3_PSTATE_DIRECT_CONTROL) {
+ return machine_state.funcs->get_pstate();
+ } else if (mode==V3_PSTATE_EXTERNAL_CONTROL) {
+ return linux_get_pstate();
} else {
- put_cpu_var(core_state);
- return 0;
+ return 0;
}
}
-void palacios_pstate_ctrl_set_pstate(uint8_t p)
+
+/*
+ * Request p-state p on the calling core.
+ *
+ * Direct control programs it through the architecture backend
+ * (machine_state.funcs), external control forwards it to
+ * linux_set_pstate(), and any other mode ignores the request.
+ */
+void palacios_pstate_ctrl_set_pstate(uint64_t p)
{
- if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) {
- put_cpu_var(core_state);
- machine_state.funcs->set_pstate(p);
- }
+ // Read the mode once so that the single get_cpu_var() is matched by a
+ // single put_cpu_var(); testing it again in an else-if would take a
+ // second, never released, preempt-disable.
+ uint32_t mode = get_cpu_var(core_state).mode;
+ put_cpu_var(core_state);
+
+ if (mode==V3_PSTATE_DIRECT_CONTROL) {
+ machine_state.funcs->set_pstate(p);
+ } else if (mode==V3_PSTATE_EXTERNAL_CONTROL) {
+ linux_set_pstate(p);
+ }
}
palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
}
+/*
+ * Return the calling core's frequency as reported by linux_get_freq() when
+ * the core is under external control; every other mode yields 0.
+ * NOTE(review): unit is presumably kHz, as in core_state's *_freq_khz
+ * fields - confirm against linux_get_freq().
+ */
uint64_t palacios_pstate_ctrl_get_freq(void)
{
if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
- put_cpu_var(core_state);
- ERROR("Unimplemented get freq\n");
- return 0;
+ put_cpu_var(core_state); // release the per-cpu access before calling out
+ return linux_get_freq();
} else {
- put_cpu_var(core_state);
- return 0;
+ put_cpu_var(core_state);
+ return 0; // no frequency is reported outside external control
}
}
+/*
+ * Forward the frequency request p to linux_set_freq() when the calling core
+ * is under external control; in every other mode the request is dropped.
+ */
void palacios_pstate_ctrl_set_freq(uint64_t p)
{
if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
- put_cpu_var(core_state);
- ERROR("Unimplemented set freq\n");
- }
- put_cpu_var(core_state);
-
+ put_cpu_var(core_state); // release the per-cpu access before calling out
+ linux_set_freq(p);
+ } else {
+ put_cpu_var(core_state); // both branches release exactly once
+ }
}
-static void switch_to_external(void)
+static int switch_to_external(void)
{
+ DEBUG("switch from host control to external\n");
+ /*
+  * Move the calling core to external control.
+  * Returns -1 when the core has no cpufreq support recorded
+  * (have_cpufreq == 0); otherwise calls linux_setup_palacios_governor(),
+  * marks the core as V3_PSTATE_EXTERNAL_CONTROL and returns 0.
+  */
if (!(get_cpu_var(core_state).have_cpufreq)) {
- put_cpu_var(core_state);
- ERROR("No cpufreq - cannot switch to external...\n");
- return;
- }
+ put_cpu_var(core_state);
+ ERROR("No cpufreq - cannot switch to external...\n");
+ return -1;
+ }
put_cpu_var(core_state);
- ERROR("Unimplemented switch to external...\n");
+ linux_setup_palacios_governor(); // presumably installs the Palacios cpufreq governor - confirm
+
+ get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
+ put_cpu_var(core_state);
+
+ return 0;
}
-
-static void switch_to_direct(void)
+/*
+ * Move the calling core to direct control.
+ * On a core with cpufreq, linux_setup_palacios_governor() is called first.
+ * The mode is changed (and the backend's arch_init() run) only when
+ * machine_state.funcs provides arch_init.
+ */
+
+static int switch_to_direct(void)
{
+ DEBUG("switch from host control to direct\n");
+
if (get_cpu_var(core_state).have_cpufreq) {
- put_cpu_var(core_state);
- ERROR("Unimplemented: switch to direct on machine with cpu freq\n");
- // The implementation would set the policy and governor to peg cpu
- // regardless of load
+ put_cpu_var(core_state);
+ DEBUG("switch to direct from cpufreq\n");
+
+ // The implementation would set the policy and governor to peg cpu
+ // regardless of load
+ linux_setup_palacios_governor();
+ } else {
+ put_cpu_var(core_state);
}
if (machine_state.funcs && machine_state.funcs->arch_init) {
- get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
-
- machine_state.funcs->arch_init();
+ get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
+
+ machine_state.funcs->arch_init(); // runs between get_cpu_var() and put_cpu_var()
- put_cpu_var(core_state);
+ put_cpu_var(core_state);
}
+ return 0; // 0 is returned even when no arch_init exists and the mode was left unchanged
}
-
-static void switch_to_internal(void)
+/*
+ * Move the calling core to internal control.
+ * On a core with cpufreq, linux_setup_palacios_governor() is called first;
+ * the mode is then set to V3_PSTATE_INTERNAL_CONTROL. Always returns 0.
+ */
+static int switch_to_internal(void)
{
+ DEBUG("switch from host control to internal\n");
+
if (get_cpu_var(core_state).have_cpufreq) {
- put_cpu_var(core_state);
- ERROR("Unimplemented: switch to internal on machine with cpu freq\n");
- return;
- // The implementation would set the policy and governor to peg cpu
- // regardless of load - exactly like direct
+ put_cpu_var(core_state);
+ DEBUG("switch to internal on machine with cpu freq\n");
+ linux_setup_palacios_governor(); // same call the direct and external switches make
+ } else {
+ put_cpu_var(core_state);
}
get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
-
+
put_cpu_var(core_state);
- return;
+ return 0;
}
-static void switch_from_external(void)
+static int switch_from_external(void) // Return this core from external to host control: 0 on success, -1 when have_cpufreq is not set.
{
if (!(get_cpu_var(core_state).have_cpufreq)) {
- put_cpu_var(core_state);
- ERROR("No cpufreq - how did we get here... external...\n");
- return -1;
+ put_cpu_var(core_state);
+ ERROR("No cpufreq - how did we get here... external...\n");
+ return -1;
}
+ put_cpu_var(core_state); // pairs with the get_cpu_var() in the check above
- ERROR("Unimplemented switch from external...\n");
-
- get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
+ DEBUG("Switching back to host control from external\n");
+ if (get_cpu_var(core_state).have_cpufreq) { // always true here: the !have_cpufreq case returned above
+ put_cpu_var(core_state);
+ linux_restore_defaults(); // presumably restores the host's cpufreq settings - confirm
+ } else {
+ put_cpu_var(core_state);
+ }
+
+ get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
put_cpu_var(core_state);
+ return 0;
}
-
-static void switch_from_direct(void)
-{
-
- if (get_cpu_var(core_state).have_cpufreq) {
- put_cpu_var(core_state);
- ERROR("Unimplemented: switch from direct on machine with cpu freq - will just pretend to do so\n");
- // The implementation would switch back to default policy and governor
- }
- get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
+/*
+ * Return the calling core from direct to host control.
+ *
+ * Programs the core's min_pstate through the architecture backend, runs the
+ * backend's arch_deinit(), calls linux_restore_defaults() when the core has
+ * cpufreq, and finally marks the core as V3_PSTATE_HOST_CONTROL.
+ * Always returns 0.
+ */
+static int switch_from_direct(void)
+{
- machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
+ DEBUG("Switching back to host control from direct\n");
+ // Set maximum performance, just in case there is no host control
+ machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
+ // Pair the get_cpu_var() used as the argument above; without this the
+ // function performs three gets but only two puts.
+ put_cpu_var(core_state);
machine_state.funcs->arch_deinit();
+ if (get_cpu_var(core_state).have_cpufreq) {
+ put_cpu_var(core_state);
+ linux_restore_defaults();
+ } else {
+ put_cpu_var(core_state);
+ }
+
+ get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
+
put_cpu_var(core_state);
+
+ return 0;
}
-
-static void switch_from_internal(void)
+/*
+ * Return the calling core from internal to host control.
+ * On a core with cpufreq, linux_restore_defaults() is called first; the
+ * mode is then set to V3_PSTATE_HOST_CONTROL. Always returns 0.
+ */
+static int switch_from_internal(void)
{
+ DEBUG("Switching back to host control from internal\n");
+
if (get_cpu_var(core_state).have_cpufreq) {
- put_cpu_var(core_state);
- ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
- // The implementation would switch back to default policy and governor
+ put_cpu_var(core_state);
+ linux_restore_defaults(); // presumably restores the host's cpufreq settings - confirm
+ } else {
+ put_cpu_var(core_state);
}
get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
put_cpu_var(core_state);
-
- return;
+
+ return 0;
}
void palacios_pstate_ctrl_acquire(uint32_t type)
{
if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) {
- palacios_pstate_ctrl_release();
+ put_cpu_var(core_state); // release the per-cpu access before calling out
+ palacios_pstate_ctrl_release(); // the core first goes back to host control, whatever mode it held
+ } else {
+ put_cpu_var(core_state);
}
- put_cpu_var(core_state);
-
switch (type) {
- case V3_PSTATE_EXTERNAL_CONTROL:
- switch_to_external();
- break;
- case V3_PSTATE_DIRECT_CONTROL:
- switch_to_direct();
- break;
- case V3_PSTATE_INTERNAL_CONTROL:
- switch_to_internal();
- break;
- default:
- ERROR("Unknown pstate control type %u\n",type);
- break;
+ case V3_PSTATE_EXTERNAL_CONTROL:
+ switch_to_external(); // the int result of the switch_to_* helpers is not propagated (this function is void)
+ break;
+ case V3_PSTATE_DIRECT_CONTROL:
+ switch_to_direct();
+ break;
+ case V3_PSTATE_INTERNAL_CONTROL:
+ switch_to_internal();
+ break;
+ default:
+ ERROR("Unknown pstate control type %u\n",type); // the core stays in host control
+ break;
}
}
void palacios_pstate_ctrl_release(void)
{
-
if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) {
- put_cpu_var(core_state);
- return;
- }
+ put_cpu_var(core_state);
+ return;
+ }
+ put_cpu_var(core_state);
switch (get_cpu_var(core_state).mode) {
- case V3_PSTATE_EXTERNAL_CONTROL:
- switch_from_external();
- break;
- case V3_PSTATE_DIRECT_CONTROL:
- switch_from_direct();
- break;
- case V3_PSTATE_INTERNAL_CONTROL:
- switch_from_internal();
- break;
- default:
- ERROR("Unknown pstate control type %u\n",core_state.mode);
- break;
+ case V3_PSTATE_EXTERNAL_CONTROL:
+ put_cpu_var(core_state);
+ switch_from_external();
+ break;
+ case V3_PSTATE_DIRECT_CONTROL:
+ put_cpu_var(core_state);
+ switch_from_direct();
+ break;
+ case V3_PSTATE_INTERNAL_CONTROL:
+ put_cpu_var(core_state);
+ switch_from_internal();
+ break;
+ default:
+ put_cpu_var(core_state);
+ ERROR("Unknown pstate control type %u\n",core_state.mode);
+ break;
}
-
- put_cpu_var(core_state);
-
}
static void update_hw_pstate(void *arg)
{
if (machine_state.funcs && machine_state.funcs->get_pstate) {
- get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
- put_cpu_var(core_state);
+ get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate(); // cache the backend-reported p-state in this core's core_state; arg is unused
+ put_cpu_var(core_state);
} else {
- get_cpu_var(core_state).cur_hw_pstate = 0;
- put_cpu_var(core_state);
+ get_cpu_var(core_state).cur_hw_pstate = 0; // no backend get_pstate available: record 0
+ put_cpu_var(core_state);
}
}
/***************************************************************************
PROC Interface to expose state
-***************************************************************************/
+ ***************************************************************************/
static int pstate_show(struct seq_file * file, void * v)
{
seq_printf(file, "V3VEE DVFS Status\n\n");
for (cpu=0;cpu<numcpus;cpu++) {
- palacios_xcall(cpu,update_hw_pstate,0);
+ palacios_xcall(cpu,update_hw_pstate,0); // have each core refresh its own cur_hw_pstate before it is printed
}
-
- seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
- machine_state.arch==INTEL ? "Intel" :
- machine_state.arch==AMD ? "AMD" : "Other",
- machine_state.supports_pstates ? "Yes" : "No");
-
+ /*
+  * One status line per core: cached hardware p-state, control mode, and
+  * the frequency range (external mode) or p-state range (direct mode).
+  * NOTE(review): cpu and numcpus are not declared in the lines visible
+  * here; presumably they come from elided context - confirm.
+  */
for (cpu=0;cpu<numcpus;cpu++) {
- struct pstate_core_info *s = &per_cpu(core_state,cpu);
- seq_printf(file,"pcore %u: hw pstate %u mode %s of [ host ",cpu,
- s->cur_hw_pstate,
- s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
- s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
- s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
- s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
- if (s->have_cpufreq) {
- seq_printf(file," external ");
- }
- if (machine_state.arch==AMD || machine_state.arch==INTEL) {
- seq_printf(file,"direct ");
- }
- seq_printf(file,"internal ] ");
- if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
- seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
- }
- if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
- seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
- }
- seq_printf(file,"\n");
+ struct pstate_core_info *s = &per_cpu(core_state,cpu); // read core `cpu`'s state from the core running this show routine
+ seq_printf(file,"pcore %u: hw pstate 0x%llx mode %s ",cpu,
+ s->cur_hw_pstate,
+ s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
+ s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
+ s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" :
+ s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
+ if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) {
+ seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz); // frequencies in kHz
+ }
+ if (s->mode==V3_PSTATE_DIRECT_CONTROL) {
+ seq_printf(file,"(min=%llu max=%llu cur=%llu) ",s->min_pstate, s->max_pstate, s->cur_pstate); // p-state values, not frequencies
+ }
+ seq_printf(file,"\n");
}
return 0;
}
.release = seq_release
};
+/*
+ * seq_file show routine for the DVFS hardware-info proc file.
+ *
+ * Prints the detected architecture, its DVFS-related feature flags, the ACPI
+ * performance-state table of the core this routine runs on, and the control
+ * modes available on that core. Always returns 0.
+ */
+static int pstate_hw_show(struct seq_file * file, void * v)
+{
+ struct acpi_processor *pr;
+ int numstates;
+
+ seq_printf(file, "V3VEE DVFS Hardware Info\n(all logical cores assumed identical)\n\n");
+
+ seq_printf(file, "Arch: \t%s\n"
+ "PStates:\t%s\n\n",
+ machine_state.arch==INTEL ? "Intel" :
+ machine_state.arch==AMD ? "AMD" : "Other",
+ machine_state.supports_pstates ? "Yes" : "No");
+
+
+#define YN(x) ((x) ? "Y" : "N")
+
+ if (machine_state.arch==INTEL) {
+ seq_printf(file,"SpeedStep: \t%s\n",YN(machine_state.have_speedstep));
+ seq_printf(file,"APERF/MPERF: \t%s\n",YN(machine_state.have_pstate_hw_coord));
+ seq_printf(file,"IDA or TurboCore: \t%s\n",YN(machine_state.have_opportunistic));
+ seq_printf(file,"Policy Hint: \t%s\n",YN(machine_state.have_policy_hint));
+ seq_printf(file,"Hardware Policy: \t%s\n",YN(machine_state.have_hwp));
+ seq_printf(file,"Hardware Duty Cycle: \t%s\n",YN(machine_state.have_hdc));
+ seq_printf(file,"MWAIT extensions: \t%s\n",YN(machine_state.have_mwait_ext));
+ seq_printf(file,"MWAIT wake on intr: \t%s\n",YN(machine_state.have_mwait_int));
+ }
+
+ if (machine_state.arch==AMD) {
+ seq_printf(file,"PState: \t%s\n",YN(machine_state.have_pstate));
+ seq_printf(file,"APERF/MPERF: \t%s\n",YN(machine_state.have_pstate_hw_coord));
+ seq_printf(file,"CoreBoost: \t%s\n",YN(machine_state.have_coreboost));
+ seq_printf(file,"Feedback: \t%s\n",YN(machine_state.have_feedback));
+ }
+
+
+ seq_printf(file,"\nPstate\tCtrl\tKHz\tmW\tuS(X)\tuS(B)\n");
+
+ // Fetch the per-cpu ACPI processor pointer exactly once so that the single
+ // put_cpu_var() below balances it, and treat a missing processor object or
+ // a missing performance table like an empty table instead of
+ // dereferencing NULL.
+ pr = get_cpu_var(processors);
+ numstates = (pr && pr->performance) ? pr->performance->state_count : 0;
+ if (!numstates) {
+ seq_printf(file,"UNKNOWN\n");
+ } else {
+ int i;
+ for (i=0;i<numstates;i++) {
+ seq_printf(file,
+ "%u\t%llx\t%llu\t%llu\t%llu\t%llu\n",
+ i,
+ pr->performance->states[i].control,
+ pr->performance->states[i].core_frequency*1000,
+ pr->performance->states[i].power,
+ pr->performance->states[i].transition_latency,
+ pr->performance->states[i].bus_master_latency);
+ }
+ }
+ put_cpu_var(processors);
+
+ seq_printf(file,"\nAvailable Modes:");
+ seq_printf(file," host");
+ if (get_cpu_var(core_state).have_cpufreq) {
+ seq_printf(file," external");
+ }
+ put_cpu_var(core_state);
+ if (machine_state.supports_pstates) {
+ seq_printf(file," direct");
+ }
+ seq_printf(file," internal\n");
+
+ return 0;
+}
+/* open hook of the hardware-info proc file: binds pstate_hw_show to the file via single_open(), passing no private data. */
+static int pstate_hw_open(struct inode * inode, struct file * file)
+{
+ return single_open(file, pstate_hw_show, NULL);
+}
+
+/* File operations of the hardware-info proc file: pstate_hw_open on open, the stock seq_file helpers for everything else. */
+static struct file_operations pstate_hw_fops = {
+ .owner = THIS_MODULE,
+ .open = pstate_hw_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+
int pstate_proc_setup(void)
{
struct proc_dir_entry *proc;
-
- proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
+ struct proc_dir_entry *prochw;
+
+ /*
+  * Create the two read-only proc entries of this module: "v3-dvfs"
+  * (status, pstate_fops) and "v3-dvfs-hw" (hardware info, pstate_hw_fops).
+  * Returns 0 when both exist and -1 otherwise; on failure no entry is left
+  * behind.
+  */
+ PAL_PROC_CREATE(proc,"v3-dvfs",0444,palacios_get_procdir(),&pstate_fops);
if (!proc) {
- ERROR("Failed to create proc entry for p-state control\n");
- return -1;
+ ERROR("Failed to create proc entry for p-state control\n");
+ return -1;
}
-
- proc->proc_fops = &pstate_fops;
-
+
+ INFO("/proc/v3vee/v3-dvfs successfully created\n");
+
+ PAL_PROC_CREATE(prochw,"v3-dvfs-hw",0444,palacios_get_procdir(),&pstate_hw_fops);
+
+ if (!prochw) {
+ ERROR("Failed to create proc entry for p-state hw info\n");
+ // Undo the first entry so a failed setup does not leave "v3-dvfs"
+ // registered.
+ remove_proc_entry("v3-dvfs",palacios_get_procdir());
+ return -1;
+ }
+
+ INFO("/proc/v3vee/v3-dvfs-hw successfully created\n");
+
return 0;
}
-
+/* Remove the "v3-dvfs-hw" and "v3-dvfs" entries from the Palacios proc directory. */
void pstate_proc_teardown(void)
{
+ remove_proc_entry("v3-dvfs-hw",palacios_get_procdir());
remove_proc_entry("v3-dvfs",palacios_get_procdir());
}
/********************************************************************
User interface (ioctls)
-********************************************************************/
+ ********************************************************************/
static int dvfs_ctrl(unsigned int cmd, unsigned long arg)
{
struct v3_dvfs_ctrl_request r;
if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
- ERROR("Failed to copy DVFS request from user\n");
- return -EFAULT;
+ ERROR("Failed to copy DVFS request from user\n"); // arg is a user pointer to one struct v3_dvfs_ctrl_request
+ return -EFAULT;
}
if (r.pcore >= num_online_cpus()) {
- ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
- return -EFAULT;
+ ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore); // only core numbers below num_online_cpus() are accepted
+ return -EFAULT; // -EFAULT is also the code used for every invalid request below
}
switch (r.cmd) {
- case V3_DVFS_ACQUIRE: {
- switch (r.acq_type) {
- case V3_DVFS_EXTERNAL:
- palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external,0);
- return 0;
- break;
- case V3_DVFS_DIRECT:
- palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct,0);
- return 0;
- break;
- default:
- ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
- return -EFAULT;
- }
- }
- break;
- case V3_DVFS_RELEASE: {
- palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release,0);
- return 0;
- }
- break;
- case V3_DVFS_SETFREQ: {
- palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
- return 0;
- }
- break;
- case V3_DVFS_SETPSTATE: {
- palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
- return 0;
- }
- default: {
- ERROR("Unknown DVFS command %u\n",r.cmd);
- return -EFAULT;
- }
- break;
+ case V3_DVFS_ACQUIRE: { // every command is executed on core r.pcore through palacios_xcall()
+ switch (r.acq_type) {
+ case V3_DVFS_EXTERNAL:
+ palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
+ return 0; // 0 is returned once the cross-call was issued; the callee's outcome is not reported back
+ break;
+ case V3_DVFS_DIRECT:
+ palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
+ return 0;
+ break;
+ default:
+ ERROR("Unknown DVFS acquire type %u\n",r.acq_type); // only external and direct can be acquired through this interface
+ return -EFAULT;
+ }
+ }
+ break;
+ case V3_DVFS_RELEASE: {
+ palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
+ return 0;
+ }
+ break;
+ case V3_DVFS_SETFREQ: {
+ palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz); // the frequency value travels in the pointer-sized xcall argument
+ return 0;
+ }
+ break;
+ case V3_DVFS_SETPSTATE: {
+ palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate); // the p-state value travels the same way
+ return 0;
+ }
+ default: {
+ ERROR("Unknown DVFS command %u\n",r.cmd);
+ return -EFAULT;
+ }
+ break;
}
}
};
-
+
static int pstate_ctrl_init(void)
{
unsigned int cpu;
pstate_arch_setup();
for (cpu=0;cpu<numcpus;cpu++) {
- palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
+ palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
}
V3_Init_Pstate_Ctrl(&hooks);
if (pstate_proc_setup()) {
- ERROR("Unable to initialize P-State Control\n");
- return -1;
+ ERROR("Unable to initialize P-State Control\n");
+ return -1;
}
pstate_user_setup();
+ pstate_linux_init();
+
INFO("P-State Control Initialized\n");
return 0;
unsigned int cpu;
unsigned int numcpus=num_online_cpus();
+ pstate_linux_deinit();
pstate_user_teardown();
// release pstate control if we have it, and we need to do this on each processor
for (cpu=0;cpu<numcpus;cpu++) {
- palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
+ palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
+ }
+
+
+ // Free any mapping table we built for Intel
+ if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) {
+ palacios_free(intel_pstate_to_ctrl);
}
+
return 0;
}
-
+
static struct linux_ext pstate_ext = {
.name = "PSTATE_CTRL",