Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, execute:
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


P-State: add notifier block for linux frequency transitions, bug fixes
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
31 #include <asm/msr.h>
32 #include <asm/msr-index.h>
33
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
37
38 #include <interfaces/vmm_pstate_ctrl.h>
39
40 #include "palacios.h"
41 #include "iface-pstate-ctrl.h"
42
43 #include "linux-exts.h"
44
45 /*
46    This P-STATE control implementation includes:
47
48    - Direct control of Intel and AMD processor pstates
49    - External control of processor states via Linux (unimplemented)
50    - Internal control of processor states in Palacios (handoff from Linux)
51
52    Additionally, it provides a user-space interface for manipulating
53    p-state regardless of the host's functionality.  This includes
54    an ioctl for commanding the implementation and a /proc file for 
55    showing current status and capabilities.
56
57    What we mean by "pstate" here is the processor's internal
58    configuration.   For AMD, this is defined as being the same as
59    the ACPI-defined p-state.  For Intel, it is not.  There, it is the 
60    contents of the perf ctl MSR, which, often, is the frequency id 
61    and voltage id (the multipliers).
62
63 */
64
65
/* Name given to the stub cpufreq governor defined in this file */
#define PALACIOS_GOVNAME "v3vee"
/* Buffer size used for the governor-switch shell command string */
#define MAX_PATH_LEN     128
/* Buffer size used for a saved governor name */
#define MAX_GOV_NAME_LEN 16
69
70
/* Per-core P-state bookkeeping (one instance per CPU) */
struct pstate_core_info {
    // Control mode of this core.  Host control is defined right here;
    // every other mode comes from the Palacios interface:
    //   V3_PSTATE_EXTERNAL_CONTROL
    //   V3_PSTATE_DIRECT_CONTROL
    //   V3_PSTATE_INTERNAL_CONTROL
#define V3_PSTATE_HOST_CONTROL 0
    uint32_t mode;

    // Meaningful while in the DIRECT state
    uint8_t cur_pstate;
    uint8_t max_pstate;
    uint8_t min_pstate;

    uint8_t cur_hw_pstate;

    // Meaningful while in the EXTERNAL state
    uint64_t set_freq_khz;   // the frequency we are hoping to get
    uint64_t cur_freq_khz;
    uint64_t max_freq_khz;
    uint64_t min_freq_khz;

    // Intel-specific
    uint8_t prior_speedstep;
    uint8_t turbo_disabled;
    uint8_t no_turbo;

    int have_cpufreq;

    // Linux's governor is stashed here when we make a mode switch
    char * linux_governor;
    // Kept so the frequency in effect when we started can be restored
    uint64_t original_hz;

};
106
107
108 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
109
110
111
// Architecture-specific operations used to assert DIRECT control
// over the core pstates
struct pstate_core_funcs {
    void     (*arch_init)(void);            // per-arch setup hook
    void     (*arch_deinit)(void);          // per-arch teardown hook
    uint64_t (*get_min_pstate)(void);
    uint64_t (*get_max_pstate)(void);
    uint64_t (*get_pstate)(void);
    void     (*set_pstate)(uint64_t pstate);
};
121
/* Machine-wide description of the P-state related capabilities */
struct pstate_machine_info {
    enum {INTEL, AMD, OTHER } arch;
    int supports_pstates;

    // AMD feature flags
    int have_pstate;
    int have_coreboost;
    int have_feedback;

    // Intel feature flags
    int have_speedstep;
    int have_opportunistic; // "Turbo Boost" or "IDA"
    int have_policy_hint;
    int have_hwp;           // hardware-controlled performance states
    int have_hdc;           // hardware duty cycling
    int have_mwait_ext;     // mwait power extensions
    int have_mwait_int;     // mwait wakes on interrupt

    // Common to both vendors
    int have_pstate_hw_coord;  // mperf/aperf

    // Operations table used for DIRECT control
    struct pstate_core_funcs *funcs;

};
148
149 static struct pstate_machine_info machine_state;
150
151
152 /****************************************************
153   AMD  DIRECT CONTROL
154  ***************************************************/
155
/*
 * AMD P-state MSR addresses.
 * AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557
 */
#define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061   /* limits   */
#define MSR_PSTATE_CTL_REG_AMD   0xc0010062   /* control  */
#define MSR_PSTATE_STAT_REG_AMD  0xc0010063   /* status   */
160
/* Bit layout used when reading MSR_PSTATE_LIMIT_REG_AMD */
struct p_state_limit_reg_amd {
    union {
        uint64_t val;                  /* the whole register */
        struct {
            /* lowest P-state value (highest perf.) supported currently
               (this can change at runtime) */
            uint8_t  pstate_limit : 4;
            /* highest P-state value supported (lowest perf) */
            uint8_t  pstate_max   : 4;
            uint64_t rsvd         : 56;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
171
172
/* Bit layout used when reading MSR_PSTATE_STAT_REG_AMD */
struct p_state_stat_reg_amd {
    union {
        uint64_t val;              /* the whole register */
        struct {
            uint8_t  pstate  : 4;  /* low four bits */
            uint64_t rsvd    : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
182
183
/* Bit layout used when writing MSR_PSTATE_CTL_REG_AMD */
struct p_state_ctl_reg_amd {
    union {
        uint64_t val;            /* the whole register */
        struct {
            uint8_t  cmd  : 4;   /* low four bits */
            uint64_t rsvd : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
193
194
195 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
196 static uint8_t supports_pstates_amd (void)
197 {
198     uint32_t eax, ebx, ecx, edx;
199
200     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
201     machine_state.have_pstate = !!(edx & (1 << 7));
202     machine_state.have_coreboost = !!(edx & (1<<9));
203     machine_state.have_feedback = !!(edx & (1<<11));
204
205     cpuid(0x6, &eax, &ebx, &ecx, &edx);
206     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
207
208     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
209             machine_state.have_pstate, 
210             machine_state.have_coreboost, 
211             machine_state.have_feedback,
212             machine_state.have_pstate_hw_coord);
213
214     return machine_state.have_pstate;
215
216
217 }
218
219
/* Architecture init hook for AMD; intentionally empty. */
static void init_arch_amd(void)
{
    /* KCH: nothing to do here */
    return;
}
224
225
/* Architecture deinit hook for AMD; intentionally empty. */
static void deinit_arch_amd(void)
{
    /* KCH: nothing to do here */
    return;
}
230
231
232 static uint64_t get_pstate_amd(void) 
233 {
234     struct p_state_stat_reg_amd pstat;
235
236     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
237
238     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
239     put_cpu_var(core_state);
240
241     return pstat.reg.pstate;
242 }
243
244
245 static void set_pstate_amd(uint64_t p)
246 {
247     struct p_state_ctl_reg_amd pctl;
248     pctl.val = 0;
249     pctl.reg.cmd = p;
250
251     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
252
253     get_cpu_var(core_state).cur_pstate=p;
254     put_cpu_var(core_state);
255 }
256
257
258 /*
259  * NOTE: HW may change this value at runtime
260  */
261 static uint64_t get_max_pstate_amd(void)
262 {
263     struct p_state_limit_reg_amd plimits;
264
265     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
266
267     return plimits.reg.pstate_max;
268 }
269
270
271 static uint64_t get_min_pstate_amd(void)
272 {
273     struct p_state_limit_reg_amd plimits;
274
275     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
276
277     return plimits.reg.pstate_limit;
278 }
279
280
281 static struct pstate_core_funcs amd_funcs =
282 {
283     .arch_init        = init_arch_amd,
284     .arch_deinit      = deinit_arch_amd,
285     .get_pstate       = get_pstate_amd,
286     .set_pstate       = set_pstate_amd,
287     .get_max_pstate   = get_max_pstate_amd,
288     .get_min_pstate   = get_min_pstate_amd,
289 };
290
291
292
293 /***********************************************************
294   INTEL DIRECT CONTROL
295  **********************************************************/
296
297
298 /*
299    This implementation uses SpeedStep, but does check
300    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
301    are available.
302    */
303
/*
 * Intel MSR addresses used by this file.
 * Intel System Programmer's Manual Vol. 3B, 14-2
 */
#define MSR_MPERF_IA32              0x000000e7
#define MSR_APERF_IA32              0x000000e8
#define MSR_MISC_ENABLE_IA32        0x000001a0
#define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
#define MSR_PLATFORM_INFO_IA32      0x000000ce
#define MSR_PERF_CTL_IA32           0x00000199
#define MSR_PERF_STAT_IA32          0x00000198
#define MSR_ENERY_PERF_BIAS_IA32    0x000001b0
313
314
/*
 * Unlike AMD, the meaning of the pstate value held in the Intel control
 * and status registers is implementation dependent.  The "official" way
 * to obtain the mapping from pstate to these values is via ACPI.  What
 * is written into the register is an "id" of an operating point.
 *
 * "Often", the 16 bit field has the frequency (the multiplier) in its
 * high order byte and the voltage in its low order byte.
 */
// MSR_PERF_CTL_IA32  r/w
struct perf_ctl_reg_intel {
    union {
        uint64_t val;
        struct {
            // The target operating point.  This is not the ACPI pstate;
            // Intel treats the value as opaque.  On many implementations
            // it appears to be frequency_id : voltage_id, with
            // frequency_id typically being the multiplier.
            uint16_t pstate                 : 16;
            uint16_t reserved               : 16;
            // Write 1 to *disengage* dynamic acceleration.
            // "IDA" and "Turbo" share this interface.
            uint16_t dynamic_accel_disable  : 1;
            uint32_t reserved2              : 31;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
346
// MSR_PERF_STAT_IA32 r
struct perf_stat_reg_intel {
    union {
        uint64_t val;
        struct {
            // the current operating point (low 16 bits)
            uint16_t pstate                 : 16;
            uint64_t reserved               : 48;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
358
// MSR_ENERGY_PERF_BIAS_IA32 r/w
struct enery_perf_bias_reg_intel {
    union {
        uint64_t val;
        struct {
            // the current policy hint (low 4 bits)
            uint8_t  policy_hint            : 4;
            uint64_t reserved               : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
370
// MSR_PLATFORM_INFO
//
// Bit positions intended by the field widths below:
//   15:8  max_noturbo_ratio
//   23    ppin_cap
//   28    ratio_limit
//   29    tdc_tdp_limit
//   47:40 min_ratio
//
// The inner bit-field struct must itself be packed.  Packing only the
// enclosing union/struct does not change the inner struct's layout, and
// without it the 10-bit uint16_t field rsvd3 (which starts at bit 30)
// may not straddle a 16-bit storage unit under the x86-64 ABI.  That
// pushes rsvd3 and every later field to higher bit positions and makes
// the struct larger than the 64-bit register it overlays.
struct turbo_mode_info_reg_intel {
    union {
        uint64_t val;
        struct {
            uint8_t  rsvd0                  : 8;
            uint8_t  max_noturbo_ratio      : 8;
            uint8_t  rsvd1                  : 7;
            uint8_t  ppin_cap               : 1;
            uint8_t  rsvd2                  : 4;
            uint8_t  ratio_limit            : 1;
            uint8_t  tdc_tdp_limit          : 1;
            uint16_t rsvd3                  : 10;
            uint8_t  min_ratio              : 8;
            uint16_t rsvd4                  : 16;
        } __attribute__((packed)) reg;
    } __attribute__((packed));
} __attribute__((packed));
389
// Holds the critical information from Linux's struct acpi_processor_px,
// replicated here to make porting to other OSes easier.
struct intel_pstate_info {
    uint64_t freq;  // frequency in KHz
    uint64_t ctrl;  // value to write into the _CTL MSR to get this state
};
396
// Fallback table, used if we cannot build the table locally
static struct intel_pstate_info *intel_pstate_to_ctrl_internal = NULL;
static int intel_num_pstates_internal = 0;

// The table in use: either the internal array above or a constructed one
static struct intel_pstate_info *intel_pstate_to_ctrl = NULL;
static int intel_num_pstates = 0;
404
405
406 /* CPUID.01:ECX.AES(7) */
407 static uint8_t supports_pstates_intel(void)
408 {
409     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
410     */
411     uint32_t eax, ebx, ecx, edx;
412
413     cpuid(0x1, &eax, &ebx, &ecx, &edx);
414     machine_state.have_speedstep =  !!(ecx & (1 << 7));
415
416     cpuid(0x6, &eax, &ebx, &ecx, &edx);
417     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
418     machine_state.have_opportunistic =  !!(eax & 1<<1);
419     machine_state.have_policy_hint = !!(ecx & 1<<3);
420     machine_state.have_hwp = !!(eax & 1<<7);
421     machine_state.have_hdc = !!(eax & 1<<13);
422
423     cpuid(0x5, &eax, &ebx, &ecx, &edx);
424     machine_state.have_mwait_ext =  !!(ecx & 1);
425     machine_state.have_mwait_int =  !!(ecx & 1<<1);
426
427
428     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
429             machine_state.have_speedstep, 
430             machine_state.have_pstate_hw_coord, 
431             machine_state.have_opportunistic,
432             machine_state.have_policy_hint,
433             machine_state.have_hwp,
434             machine_state.have_hdc,
435             machine_state.have_mwait_ext,
436             machine_state.have_mwait_int );
437
438
439     if (machine_state.have_speedstep) {
440         uint32_t i;
441         // Build mapping table (from "pstate" (0..) to ctrl value for MSR
442         if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { 
443             put_cpu_var(processors);
444             // no acpi...  revert to internal table
445             intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
446             intel_num_pstates=intel_num_pstates_internal;
447         } else {
448             intel_num_pstates = get_cpu_var(processors)->performance->state_count;
449             if (intel_num_pstates) { 
450                 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
451                 if (!intel_pstate_to_ctrl) { 
452                     ERROR("P-State: Cannot allocate space for mapping...\n");
453                     intel_num_pstates=0;
454                 }
455                 for (i=0;i<intel_num_pstates;i++) { 
456                     intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
457                     intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
458                 }
459                     
460             } else {
461                 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
462             }
463         }
464         put_cpu_var(processors);
465         INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
466         for (i=0;i<intel_num_pstates;i++) {
467             INFO("P-State: Intel Mapping %u:  freq=%llu  ctrl=%llx\n",
468                  i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
469         }
470     } else {
471         INFO("P-State: Intel:  No speedstep here\n");
472     }
473         
474
475     return machine_state.have_speedstep;
476 }
477
478
479 static void init_arch_intel(void)
480 {
481     uint64_t val;
482
483     rdmsrl(MSR_MISC_ENABLE_IA32, val);
484
485     //INFO("P-State: prior ENABLE=%llx\n",val);
486
487     // store prior speedstep setting
488     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
489     put_cpu_var(core_state);
490
491     // enable speedstep (probably already on)
492     val |= 1 << 16;
493     wrmsrl(MSR_MISC_ENABLE_IA32, val);
494
495     //INFO("P-State: write ENABLE=%llx\n",val);
496
497 }
498
499 static void deinit_arch_intel(void)
500 {
501     uint64_t val;
502
503     rdmsrl(MSR_MISC_ENABLE_IA32, val);
504
505     //INFO("P-State: deinit: ENABLE=%llx\n",val);
506
507     val &= ~(1ULL << 16);
508     val |= get_cpu_var(core_state).prior_speedstep << 16;
509     put_cpu_var(core_state);
510
511     wrmsrl(MSR_MISC_ENABLE_IA32, val);
512
513     //INFO("P-state: deinit ENABLE=%llx\n",val);
514
515 }
516
517 /* TODO: Intel P-states require sampling at intervals... */
518 static uint64_t get_pstate_intel(void)
519 {
520     uint64_t val;
521
522     rdmsrl(MSR_PERF_STAT_IA32,val);
523
524     //INFO("P-State: Get: 0x%llx\n", val);
525
526     // should check if turbo is active, in which case 
527     // this value is not the whole story
528
529     return val;
530 }
531
532 static void set_pstate_intel(uint64_t p)
533 {
534     uint64_t val;
535     uint64_t ctrl;
536
537     if (intel_num_pstates==0) { 
538         return ;
539     } else {
540         if (p>=intel_num_pstates) { 
541             p=intel_num_pstates-1;
542         }
543     }
544
545     ctrl=intel_pstate_to_ctrl[p].ctrl;
546
547     /* ...Intel IDA (dynamic acceleration)
548        if (c->no_turbo && !c->turbo_disabled) {
549        val |= 1 << 32;
550        }
551        */
552     // leave all bits along expect for the likely
553     // fid bits
554
555     rdmsrl(MSR_PERF_CTL_IA32, val);
556     INFO("P-State: Pre-Set: 0x%llx\n", val);
557
558     val &= ~0xffffULL;
559     val |= ctrl & 0xffffULL;
560
561     INFO("P-State: Set: 0x%llx\n", val);
562
563     wrmsrl(MSR_PERF_CTL_IA32, val);
564
565     get_cpu_var(core_state).cur_pstate = p;
566     put_cpu_var(core_state);
567 }
568
569
/* The smallest Intel pstate index is always 0. */
static uint64_t get_min_pstate_intel(void)
{
    const uint64_t lowest_index = 0;

    return lowest_index;
}
574
575
576
577 static uint64_t get_max_pstate_intel (void)
578 {
579     if (intel_num_pstates==0) { 
580         return 0;
581     } else {
582         return intel_num_pstates-1;
583     }
584 }
585
586 static struct pstate_core_funcs intel_funcs =
587 {
588     .arch_init        = init_arch_intel,
589     .arch_deinit      = deinit_arch_intel,
590     .get_pstate       = get_pstate_intel,
591     .set_pstate       = set_pstate_intel,
592     .get_max_pstate   = get_max_pstate_intel,
593     .get_min_pstate   = get_min_pstate_intel,
594 };
595
596
597
598 /***********************************************
599   Arch determination and setup
600  ***********************************************/
601
/*
 * Execute CPUID with EAX=id and store the resulting EAX, EBX, ECX and
 * EDX into dest[0..3], in that order.
 */
static inline void cpuid_string (uint32_t id, uint32_t dest[4])
{
    __asm__ __volatile__("cpuid"
            : "=a"(dest[0]), "=b"(dest[1]), "=c"(dest[2]), "=d"(dest[3])
            : "a"(id));
}
608
609
/*
 * Fill name with the 12-character CPU vendor string from CPUID leaf 0
 * (register order EBX, EDX, ECX) followed by a NUL terminator.
 *
 * Returns the EAX value reported by leaf 0.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t regs[4];

    cpuid_string(0, regs);

    memcpy(name,     &regs[1], sizeof(uint32_t));   /* EBX */
    memcpy(name + 4, &regs[3], sizeof(uint32_t));   /* EDX */
    memcpy(name + 8, &regs[2], sizeof(uint32_t));   /* ECX */
    name[12] = 0;

    return regs[0];
}
624
625
/* Return nonzero when the CPU vendor string is "GenuineIntel". */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);
    return strcmp(vendor, "GenuineIntel") == 0;
}
632
633
/* Return nonzero when the CPU vendor string is "AuthenticAMD". */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);
    return strcmp(vendor, "AuthenticAMD") == 0;
}
640
641 static int pstate_arch_setup(void)
642 {
643
644     if (is_amd()) {
645         machine_state.arch = AMD;
646         machine_state.funcs = &amd_funcs;
647         machine_state.supports_pstates = supports_pstates_amd();
648         INFO("PSTATE: P-State initialized for AMD\n");
649     } else if (is_intel()) {
650         machine_state.arch  = INTEL;
651         machine_state.funcs = &intel_funcs;
652         machine_state.supports_pstates = supports_pstates_intel();
653         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
654         return 0;
655
656     } else {
657         machine_state.arch = OTHER;
658         machine_state.funcs = NULL;
659         machine_state.supports_pstates = 0;
660         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
661         return 0;
662     }
663
664     return 0;
665 }
666
667
668
669 /******************************************************************
670   Linux Interface
671  *****************************************************************/
672
/* Count of governor starts not yet matched by a governor stop */
static unsigned int cpus_using_v3_governor;
/* Held while the count above is changed and by the frequency-switch work */
static DEFINE_MUTEX(v3_governor_mutex);
675
676 /* KCH: this will tell us when there is an actual frequency transition */
677 static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
678         void *data)
679 {
680     struct cpufreq_freqs *freq = data;
681
682     if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) {
683         return 0;
684     }
685
686     if (val == CPUFREQ_POSTCHANGE) {
687         DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
688                 freq->cpu, freq->new);
689         per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new;
690     }
691
692     return 0;
693
694 }
695
696
697 static struct notifier_block v3_cpufreq_notifier_block = {
698     .notifier_call = v3_cpufreq_notifier
699 };
700
701
702 /* 
703  * This stub governor is simply a placeholder for preventing 
704  * frequency changes from the Linux side. For now, we simply leave
705  * the frequency as is when we acquire control. 
706  */
707 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
708 {
709     unsigned cpu = policy->cpu;
710
711     switch (event) {
712         /* we can't use cpufreq_driver_target here as it can result
713          * in a circular dependency, so we'll keep the current frequency as is
714          */
715         case CPUFREQ_GOV_START:
716             BUG_ON(!policy->cur);
717
718             mutex_lock(&v3_governor_mutex);
719
720             if (cpus_using_v3_governor == 0) {
721                 cpufreq_register_notifier(&v3_cpufreq_notifier_block,
722                         CPUFREQ_TRANSITION_NOTIFIER);
723             }
724
725             cpus_using_v3_governor++;
726
727             per_cpu(core_state, cpu).set_freq_khz = policy->cur;
728             per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
729             per_cpu(core_state, cpu).max_freq_khz = policy->max;
730             per_cpu(core_state, cpu).min_freq_khz = policy->min;
731
732             mutex_unlock(&v3_governor_mutex);
733             break;
734         case CPUFREQ_GOV_STOP:
735             mutex_lock(&v3_governor_mutex);
736
737             cpus_using_v3_governor--;
738
739             if (cpus_using_v3_governor == 0) {
740                 cpufreq_unregister_notifier(
741                         &v3_cpufreq_notifier_block,
742                         CPUFREQ_TRANSITION_NOTIFIER);
743             }
744
745             per_cpu(core_state, cpu).set_freq_khz = 0;
746             per_cpu(core_state, cpu).cur_freq_khz = 0;
747             per_cpu(core_state, cpu).max_freq_khz = 0;
748             per_cpu(core_state, cpu).min_freq_khz = 0;
749
750             mutex_unlock(&v3_governor_mutex);
751             break;
752         case CPUFREQ_GOV_LIMITS:
753             /* do nothing */
754             break;
755         default:
756             ERROR("Undefined governor command (%u)\n", event);
757             return -1;
758     }                           
759
760     return 0;
761 }
762
763
764 static struct cpufreq_governor stub_governor = 
765 {
766     .name = PALACIOS_GOVNAME,
767     .governor = governor_run,
768     .owner = THIS_MODULE,
769 };
770
771
772 static struct workqueue_struct *pstate_wq;
773
774 typedef struct {
775     struct work_struct work;
776     uint64_t freq;
777 } pstate_work_t;
778
779
780
781 static inline void pstate_register_linux_governor(void)
782 {
783     cpufreq_register_governor(&stub_governor);
784 }
785
786
787 static inline void pstate_unregister_linux_governor(void)
788 {
789     cpufreq_unregister_governor(&stub_governor);
790 }
791
792
793 static int pstate_linux_init(void)
794 {
795     pstate_register_linux_governor();
796     pstate_wq = create_workqueue("v3vee_pstate_wq");
797     if (!pstate_wq) {
798         ERROR("Could not create work queue\n");
799         goto out_err;
800     }
801
802     return 0;
803
804 out_err:
805     pstate_unregister_linux_governor();
806     return -1;
807 }
808
809
810 static void pstate_linux_deinit(void)
811 {
812     pstate_unregister_linux_governor();
813     flush_workqueue(pstate_wq);
814     destroy_workqueue(pstate_wq);
815 }
816
817
818 static int get_current_governor(char **buf, unsigned int cpu)
819 {
820     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
821     char * govname = NULL;
822
823     if (!policy) {
824         ERROR("could not allocate cpufreq_policy\n");
825         return -1;
826     }
827         
828     if (cpufreq_get_policy(policy, cpu) != 0) {
829         ERROR("Could not get current cpufreq policy\n");
830         goto out_err;
831     }
832
833     /* We're in interrupt context, should probably not wait here */
834     govname = palacios_alloc(MAX_GOV_NAME_LEN);
835     if (!govname) {
836         ERROR("Could not allocate space for governor name\n");
837         goto out_err;
838     }
839
840     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
841
842     get_cpu_var(core_state).linux_governor = govname;
843     put_cpu_var(core_state);
844
845     *buf = govname;
846
847     palacios_free(policy);
848
849     return 0;
850
851 out_err:
852     palacios_free(policy);
853     return -1;
854 }
855
856
857 /* passed to the userspacehelper interface for cleanup */
858 static void gov_switch_cleanup(struct subprocess_info * s)
859 {
860     palacios_free(s->argv[2]);
861     palacios_free(s->argv);
862 }
863
864
865 /* 
866  * Switch governors
867  * @s - the governor to switch to 
868  * TODO: this should probably be submitted to a work queue
869  * so we don't have to run it in interrupt context
870  */
871 static int governor_switch(char * s, unsigned int cpu)
872 {
873     char * path_str = NULL;
874     char ** argv = NULL; 
875
876     static char * envp[] = {
877         "HOME=/",
878         "TERM=linux",
879         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
880
881
882     argv = palacios_alloc(4*sizeof(char*));
883     if (!argv) {
884         ERROR("Couldn't allocate argv struct\n");
885         return -1;
886     }
887
888     path_str = palacios_alloc(MAX_PATH_LEN);
889     if (!path_str) {
890         ERROR("Couldn't allocate path string\n");
891         goto out_freeargv;
892     }
893     memset(path_str, 0, MAX_PATH_LEN);
894
895     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
896
897     argv[0] = "/bin/sh";
898     argv[1] = "-c";
899     argv[2] = path_str;
900     argv[3] = NULL;
901
902     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
903     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
904
905 out_freeargv:
906     palacios_free(argv);
907     return -1;
908 }
909
910
911 static inline void free_linux_governor(void)
912 {
913     palacios_free(get_cpu_var(core_state).linux_governor);
914     put_cpu_var(core_state);
915 }
916
917
918 static int linux_setup_palacios_governor(void)
919 {
920     char * gov;
921     unsigned int cpu = get_cpu();
922     put_cpu();
923
924     /* KCH:  we assume the v3vee governor is already 
925      * registered with kernel by this point 
926      */
927
928     if (get_current_governor(&gov, cpu) < 0) {
929         ERROR("Could not get current governor\n");
930         return -1;
931     }
932
933     DEBUG("saving current governor (%s)\n", gov);
934
935     get_cpu_var(core_state).linux_governor = gov;
936     put_cpu_var(core_state);
937     
938     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
939
940     /* set the new one to ours */
941
942     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
943         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
944         return -1;
945     }
946
947     return 0;
948 }
949
950
951
952 static int linux_get_pstate(void)
953 {
954     struct cpufreq_policy * policy = NULL;
955     struct cpufreq_frequency_table *table;
956     unsigned int i = 0;
957     unsigned int count = 0;
958     unsigned int cpu = get_cpu(); 
959     put_cpu();
960
961
962     policy = palacios_alloc(sizeof(struct cpufreq_policy));
963     if (!policy) {
964         ERROR("Could not allocate policy struct\n");
965         return -1;
966     }
967
968     cpufreq_get_policy(policy, cpu);
969     table = cpufreq_frequency_get_table(cpu);
970
971     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
972
973         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
974             continue;
975         }
976
977         if (table[i].frequency == policy->cur) {
978             break;
979         }
980
981         count++;
982     }
983
984     palacios_free(policy);
985
986     put_cpu();
987     return count;
988 }
989
990
991 static int linux_get_freq(void)
992 {
993     struct cpufreq_policy * policy = NULL;
994     unsigned int cpu = get_cpu();
995     put_cpu();
996
997     policy = palacios_alloc(sizeof(struct cpufreq_policy));
998     if (!policy) {
999         ERROR("Could not allocate policy struct\n");
1000         return -1;
1001     }
1002
1003     if (cpufreq_get_policy(policy, cpu)) {
1004         ERROR("Could not get current policy\n");
1005         return -1;
1006     }
1007
1008     return policy->cur;
1009 }
1010
1011 static void  
1012 pstate_switch_workfn (struct work_struct *work)
1013 {
1014     pstate_work_t * pwork = (pstate_work_t*)work;
1015     struct cpufreq_policy * policy = NULL;
1016     uint64_t freq; 
1017     unsigned int cpu = get_cpu();
1018     put_cpu();
1019
1020     mutex_lock(&v3_governor_mutex);
1021
1022     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1023     if (!policy) {
1024         ERROR("Could not allocate space for cpufreq policy\n");
1025         goto out;
1026     }
1027
1028     if (cpufreq_get_policy(policy, cpu) != 0) {
1029         ERROR("Could not get cpufreq policy\n");
1030         goto out1;
1031     }
1032
1033     freq = pwork->freq;
1034     get_cpu_var(core_state).set_freq_khz = freq;
1035
1036     if (freq < get_cpu_var(core_state).min_freq_khz) {
1037         freq = get_cpu_var(core_state).min_freq_khz;
1038     }
1039     if (freq > get_cpu_var(core_state).max_freq_khz) {
1040         freq = get_cpu_var(core_state).max_freq_khz;
1041     }
1042     put_cpu_var(core_state);
1043
1044     INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
1045     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
1046
1047 out1:
1048     palacios_free(policy);
1049 out:
1050     palacios_free(work);
1051     mutex_unlock(&v3_governor_mutex);
1052
1053
1054
1055 static int linux_set_pstate(uint8_t p)
1056 {
1057     struct cpufreq_policy * policy = NULL;
1058     struct cpufreq_frequency_table *table;
1059     pstate_work_t * work = NULL;
1060     unsigned int i = 0;
1061     unsigned int count = 0;
1062     int state_set = 0;
1063     int last_valid = 0;
1064     unsigned int cpu = get_cpu();
1065     put_cpu();
1066
1067     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1068     if (!policy) {
1069         ERROR("Could not allocate policy struct\n");
1070         return -1;
1071     }
1072
1073     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1074     if (!work) {
1075         ERROR("Could not allocate work struct\n");
1076         goto out_err;
1077     }
1078
1079     if (cpufreq_get_policy(policy, cpu)) {
1080         ERROR("Could not get current policy\n");
1081         goto out_err1;
1082     }
1083     table = cpufreq_frequency_get_table(cpu);
1084
1085     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1086
1087         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1088             continue;
1089         }
1090
1091         if (count == p) {
1092
1093             INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1094             work->freq = table[i].frequency;
1095             queue_work(pstate_wq, (struct work_struct*)work);
1096
1097             state_set = 1;
1098             break;
1099         }
1100
1101         count++;
1102         last_valid = i;
1103     }
1104
1105     /* we need to deal with the case in which we get a number > max pstate */
1106     if (!state_set) {
1107         INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1108         work->freq = table[last_valid].frequency;
1109         queue_work(pstate_wq, (struct work_struct*)work);
1110     }
1111
1112     palacios_free(policy);
1113     return 0;
1114
1115 out_err1: 
1116     palacios_free(work);
1117 out_err:
1118     palacios_free(policy);
1119     return -1;
1120 }
1121
1122
1123 static int linux_set_freq(uint64_t f)
1124 {
1125     struct cpufreq_policy * policy = NULL;
1126     pstate_work_t * work = NULL;
1127     uint64_t freq;
1128     unsigned int cpu = get_cpu();
1129     put_cpu();
1130
1131     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1132     if (!policy) {
1133         ERROR("Could not allocate policy struct\n");
1134         return -1;
1135     }
1136
1137     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1138     if (!work) {
1139         ERROR("Could not allocate work struct\n");
1140         goto out_err;
1141     }
1142
1143     if (cpufreq_get_policy(policy, cpu) != 0) {
1144         ERROR("Could not get cpufreq policy\n");
1145         goto out_err1;
1146     }
1147
1148     if (f < policy->min) {
1149         freq = policy->min;
1150     } else if (f > policy->max) {
1151         freq = policy->max;
1152     } else {
1153         freq = f;
1154     }
1155
1156     INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1157     work->freq = freq;
1158     queue_work(pstate_wq, (struct work_struct*)work);
1159
1160     palacios_free(policy);
1161     return 0;
1162
1163 out_err1:
1164     palacios_free(work);
1165 out_err:
1166     palacios_free(policy);
1167     return -1;
1168 }
1169
1170
1171 static int linux_restore_defaults(void)
1172 {
1173     char * gov = NULL;
1174     unsigned int cpu = get_cpu();
1175     put_cpu();
1176
1177     gov = get_cpu_var(core_state).linux_governor;
1178     put_cpu_var(core_state);
1179
1180     DEBUG("restoring previous governor (%s)\n", gov);
1181
1182     if (governor_switch(gov, cpu) < 0) {
1183         ERROR("Could not restore governor to (%s)\n", gov);
1184         goto out_err;
1185     }
1186
1187     free_linux_governor();
1188     return 0;
1189
1190 out_err:
1191     free_linux_governor();
1192     return -1;
1193 }
1194
1195
1196
1197 /******************************************************************
1198   Generic Interface as provided to Palacios and to the rest of the
1199   module
1200  ******************************************************************/
1201
1202 static void init_core(void)
1203 {
1204     unsigned cpu;
1205     struct cpufreq_policy *p;
1206     unsigned int i;
1207
1208
1209     DEBUG("P-State Core Init\n");
1210
1211     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1212     get_cpu_var(core_state).cur_pstate = 0;
1213
1214     if (machine_state.funcs) {
1215         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1216         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1217     } else {
1218         get_cpu_var(core_state).min_pstate = 0;
1219         get_cpu_var(core_state).max_pstate = 0;
1220     }
1221
1222
1223     cpu = get_cpu(); put_cpu();
1224
1225     p = cpufreq_cpu_get(cpu);
1226
1227     if (!p) { 
1228         get_cpu_var(core_state).have_cpufreq = 0;
1229         get_cpu_var(core_state).min_freq_khz=0;
1230         get_cpu_var(core_state).max_freq_khz=0;
1231         get_cpu_var(core_state).cur_freq_khz=0;
1232     } else {
1233         get_cpu_var(core_state).have_cpufreq = 1;
1234         get_cpu_var(core_state).min_freq_khz=p->min;
1235         get_cpu_var(core_state).max_freq_khz=p->max;
1236         get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); 
1237     put_cpu_var(core_state);
1238
1239     for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) { 
1240         INFO("P-State: %u: freq=%llu ctrl=%llx",
1241                 i, 
1242                 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1243                 get_cpu_var(processors)->performance->states[i].control);
1244    }
1245    put_cpu_var(processors);
1246 }
1247
1248
1249 void palacios_pstate_ctrl_release(void);
1250
1251
/*
 * Per-core teardown: log it, then give up whatever p-state control
 * mode is currently held via palacios_pstate_ctrl_release().
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    palacios_pstate_ctrl_release();
}
1258
1259
1260
1261 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1262 {
1263     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1264
1265
1266     c->features = V3_PSTATE_INTERNAL_CONTROL;
1267
1268     if (get_cpu_var(core_state).have_cpufreq) {
1269         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1270     }
1271
1272     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1273         c->features |= V3_PSTATE_DIRECT_CONTROL;
1274     }
1275     c->cur_mode = get_cpu_var(core_state).mode;
1276     c->min_pstate = get_cpu_var(core_state).min_pstate;
1277     c->max_pstate = get_cpu_var(core_state).max_pstate;
1278     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1279     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1280     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1281     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1282
1283     put_cpu_var(core_state);
1284
1285
1286
1287 }
1288
1289
1290 uint64_t palacios_pstate_ctrl_get_pstate(void)
1291 {
1292     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1293         put_cpu_var(core_state);
1294         return machine_state.funcs->get_pstate();
1295     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1296         put_cpu_var(core_state);
1297         return linux_get_pstate();
1298     } else {
1299         put_cpu_var(core_state);
1300         return 0;
1301     }
1302 }
1303
1304
1305 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1306 {
1307     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1308         put_cpu_var(core_state);
1309         machine_state.funcs->set_pstate(p);
1310     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1311         put_cpu_var(core_state);
1312         linux_set_pstate(p);
1313     } else {
1314         put_cpu_var(core_state);
1315     }
1316 }
1317
1318
/*
 * Adapter with a void* argument: the pointer value itself carries the
 * requested p-state, which is narrowed to 8 bits before being passed
 * to palacios_pstate_ctrl_set_pstate().
 */
void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
{
    uint8_t pstate = (uint8_t)(uint64_t)p;

    palacios_pstate_ctrl_set_pstate(pstate);
}
1323
1324
1325 uint64_t palacios_pstate_ctrl_get_freq(void)
1326 {
1327     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1328         put_cpu_var(core_state);
1329         return linux_get_freq();
1330     } else {
1331         put_cpu_var(core_state);
1332         return 0;
1333     }
1334 }
1335
1336
1337 void palacios_pstate_ctrl_set_freq(uint64_t p)
1338 {
1339     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1340         put_cpu_var(core_state);
1341         linux_set_freq(p);
1342     } else {
1343         put_cpu_var(core_state);
1344     }
1345 }
1346
1347
1348 static int switch_to_external(void)
1349 {
1350     DEBUG("switch from host control to external\n");
1351
1352     if (!(get_cpu_var(core_state).have_cpufreq)) {
1353         put_cpu_var(core_state);
1354         ERROR("No cpufreq  - cannot switch to external...\n");
1355         return -1;
1356     } 
1357     put_cpu_var(core_state);
1358
1359     linux_setup_palacios_governor();
1360
1361     get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1362     put_cpu_var(core_state);
1363
1364     return 0;
1365 }
1366
1367
1368 static int switch_to_direct(void)
1369 {
1370     DEBUG("switch from host control to direct\n");
1371
1372     if (get_cpu_var(core_state).have_cpufreq) { 
1373         put_cpu_var(core_state);
1374         DEBUG("switch to direct from cpufreq\n");
1375
1376         // The implementation would set the policy and governor to peg cpu
1377         // regardless of load
1378         linux_setup_palacios_governor();
1379     } else {
1380         put_cpu_var(core_state);
1381     }
1382
1383     if (machine_state.funcs && machine_state.funcs->arch_init) {
1384         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1385
1386         machine_state.funcs->arch_init();
1387
1388         put_cpu_var(core_state);
1389     }
1390
1391     return 0;
1392 }
1393
1394
1395 static int switch_to_internal(void)
1396 {
1397     DEBUG("switch from host control to internal\n");
1398
1399     if (get_cpu_var(core_state).have_cpufreq) { 
1400         put_cpu_var(core_state);
1401         DEBUG("switch to internal on machine with cpu freq\n");
1402         linux_setup_palacios_governor();
1403     } else {
1404         put_cpu_var(core_state);
1405     }
1406
1407     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1408
1409     put_cpu_var(core_state);
1410
1411     return 0;
1412 }
1413
1414
1415 static int switch_from_external(void)
1416 {
1417     if (!(get_cpu_var(core_state).have_cpufreq)) {
1418         put_cpu_var(core_state);
1419         ERROR("No cpufreq  - how did we get here... external...\n");
1420         return -1;
1421     }
1422     put_cpu_var(core_state);
1423
1424     DEBUG("Switching back to host control from external\n");
1425
1426     if (get_cpu_var(core_state).have_cpufreq) { 
1427         put_cpu_var(core_state);
1428         linux_restore_defaults();
1429     } else {
1430         put_cpu_var(core_state);
1431     }
1432
1433     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1434     put_cpu_var(core_state);
1435
1436     return 0;
1437 }
1438
1439
1440 static int switch_from_direct(void)
1441 {
1442
1443     DEBUG("Switching back to host control from direct\n");
1444
1445     // Set maximum performance, just in case there is no host control
1446     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1447     machine_state.funcs->arch_deinit();
1448
1449     if (get_cpu_var(core_state).have_cpufreq) { 
1450         put_cpu_var(core_state);
1451         linux_restore_defaults();
1452     } else {
1453         put_cpu_var(core_state);
1454     }
1455
1456     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1457
1458     put_cpu_var(core_state);
1459
1460     return 0;
1461 }
1462
1463
1464 static int switch_from_internal(void)
1465 {
1466     DEBUG("Switching back to host control from internal\n");
1467
1468     if (get_cpu_var(core_state).have_cpufreq) { 
1469         put_cpu_var(core_state);
1470         // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1471         // The implementation would switch back to default policy and governor
1472         linux_restore_defaults();
1473     } else {
1474         put_cpu_var(core_state);
1475     }
1476
1477     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1478
1479     put_cpu_var(core_state);
1480
1481     return 0;
1482 }
1483
1484
1485
1486 void palacios_pstate_ctrl_acquire(uint32_t type)
1487 {
1488     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1489         put_cpu_var(core_state);
1490         palacios_pstate_ctrl_release();
1491     } else {
1492         put_cpu_var(core_state);
1493     }
1494
1495     switch (type) { 
1496         case V3_PSTATE_EXTERNAL_CONTROL:
1497             switch_to_external();
1498             break;
1499         case V3_PSTATE_DIRECT_CONTROL:
1500             switch_to_direct();
1501             break;
1502         case V3_PSTATE_INTERNAL_CONTROL:
1503             switch_to_internal();
1504             break;
1505         default:
1506             ERROR("Unknown pstate control type %u\n",type);
1507             break;
1508     }
1509
1510 }
1511
1512 // Wrappers for xcalls
1513 static void palacios_pstate_ctrl_acquire_external(void)
1514 {
1515     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1516 }
1517
1518 static void palacios_pstate_ctrl_acquire_direct(void)
1519 {
1520     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1521 }
1522
1523
1524 void palacios_pstate_ctrl_release(void)
1525 {
1526     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1527         put_cpu_var(core_state);
1528         return;
1529     } 
1530     put_cpu_var(core_state);
1531
1532     switch (get_cpu_var(core_state).mode) { 
1533         case V3_PSTATE_EXTERNAL_CONTROL:
1534             put_cpu_var(core_state);
1535             switch_from_external();
1536             break;
1537         case V3_PSTATE_DIRECT_CONTROL:
1538             put_cpu_var(core_state);
1539             switch_from_direct();
1540             break;
1541         case V3_PSTATE_INTERNAL_CONTROL:
1542             put_cpu_var(core_state);
1543             switch_from_internal();
1544             break;
1545         default:
1546             put_cpu_var(core_state);
1547             ERROR("Unknown pstate control type %u\n",core_state.mode);
1548             break;
1549     }
1550 }
1551
1552
1553 static void update_hw_pstate(void *arg)
1554 {
1555     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1556         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1557         put_cpu_var(core_state);
1558     } else {
1559         get_cpu_var(core_state).cur_hw_pstate = 0;
1560         put_cpu_var(core_state);
1561     }
1562 }
1563
1564
1565 /***************************************************************************
1566   PROC Interface to expose state
1567  ***************************************************************************/
1568
1569 static int pstate_show(struct seq_file * file, void * v)
1570 {
1571     unsigned int cpu;
1572     unsigned int numcpus = num_online_cpus();
1573
1574     seq_printf(file, "V3VEE DVFS Status\n\n");
1575
1576     for (cpu=0;cpu<numcpus;cpu++) { 
1577         palacios_xcall(cpu,update_hw_pstate,0);
1578     }
1579
1580     seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1581             machine_state.arch==INTEL ? "Intel" : 
1582             machine_state.arch==AMD ? "AMD" : "Other",
1583             machine_state.supports_pstates ? "Yes" : "No");
1584
1585     for (cpu=0;cpu<numcpus;cpu++) { 
1586         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1587         seq_printf(file,"pcore %u: hw pstate 0x%x mode %s of [ host ",cpu,
1588                 s->cur_hw_pstate,
1589                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1590                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1591                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1592                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1593         if (s->have_cpufreq) { 
1594             seq_printf(file,"external ");
1595         }
1596         if (machine_state.supports_pstates) {
1597             seq_printf(file,"direct ");
1598         }
1599         seq_printf(file,"internal ] ");
1600         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1601             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1602         } 
1603         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1604             seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1605         }
1606         seq_printf(file,"\n");
1607     }
1608     return 0;
1609 }
1610
1611 static int pstate_open(struct inode * inode, struct file * file) 
1612 {
1613     return single_open(file, pstate_show, NULL);
1614 }
1615
1616
1617 static struct file_operations pstate_fops = {
1618     .owner = THIS_MODULE,
1619     .open = pstate_open, 
1620     .read = seq_read,
1621     .llseek = seq_lseek,
1622     .release = seq_release
1623 };
1624
1625 int pstate_proc_setup(void)
1626 {
1627     struct proc_dir_entry *proc;
1628
1629     proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1630
1631     if (!proc) { 
1632         ERROR("Failed to create proc entry for p-state control\n");
1633         return -1;
1634     }
1635
1636     proc->proc_fops = &pstate_fops;
1637
1638     return 0;
1639 }
1640
/* Remove the "v3-dvfs" entry from the Palacios proc directory */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *parent = palacios_get_procdir();

    remove_proc_entry("v3-dvfs",parent);
}
1645
1646 /********************************************************************
1647   User interface (ioctls)
1648  ********************************************************************/
1649
1650 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1651 {
1652     struct v3_dvfs_ctrl_request r;
1653
1654     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1655         ERROR("Failed to copy DVFS request from user\n");
1656         return -EFAULT;
1657     }
1658
1659     if (r.pcore >= num_online_cpus()) {
1660         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1661         return -EFAULT;
1662     }
1663
1664     switch (r.cmd) {
1665         case V3_DVFS_ACQUIRE: {
1666                                   switch (r.acq_type) { 
1667                                       case V3_DVFS_EXTERNAL:
1668                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1669                                           return 0;
1670                                           break;
1671                                       case V3_DVFS_DIRECT:
1672                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1673                                           return 0;
1674                                           break;
1675                                       default:
1676                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1677                                           return -EFAULT;
1678                                   }
1679                               }
1680                               break;
1681         case V3_DVFS_RELEASE: {
1682                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1683                                   return 0;
1684                               }
1685                               break;
1686         case V3_DVFS_SETFREQ: {
1687                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1688                                   return 0;
1689                               }
1690                               break;
1691         case V3_DVFS_SETPSTATE: {
1692                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1693                                     return 0;
1694                                 }
1695         default: {
1696                      ERROR("Unknown DVFS command %u\n",r.cmd);
1697                      return -EFAULT;
1698                  }
1699                  break;
1700     }
1701 }
1702
1703
1704 void pstate_user_setup(void)
1705 {
1706     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1707 }
1708
1709
1710 void pstate_user_teardown(void)
1711 {
1712     remove_global_ctrl(V3_DVFS_CTRL);
1713 }
1714
1715 static struct v3_host_pstate_ctrl_iface hooks = {
1716     .get_chars = palacios_pstate_ctrl_get_chars,
1717     .acquire = palacios_pstate_ctrl_acquire,
1718     .release = palacios_pstate_ctrl_release,
1719     .set_pstate = palacios_pstate_ctrl_set_pstate,
1720     .get_pstate = palacios_pstate_ctrl_get_pstate,
1721     .set_freq = palacios_pstate_ctrl_set_freq,
1722     .get_freq = palacios_pstate_ctrl_get_freq,
1723 };
1724
1725
1726
1727 static int pstate_ctrl_init(void) 
1728 {
1729     unsigned int cpu;
1730     unsigned int numcpus = num_online_cpus();
1731
1732     pstate_arch_setup();
1733
1734     for (cpu=0;cpu<numcpus;cpu++) { 
1735         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1736     }
1737
1738     V3_Init_Pstate_Ctrl(&hooks);  
1739
1740     if (pstate_proc_setup()) { 
1741         ERROR("Unable to initialize P-State Control\n");
1742         return -1;
1743     }
1744
1745     pstate_user_setup();
1746
1747     pstate_linux_init();
1748
1749     INFO("P-State Control Initialized\n");
1750
1751     return 0;
1752 }
1753
1754 static int pstate_ctrl_deinit(void)
1755 {
1756     unsigned int cpu;
1757     unsigned int numcpus=num_online_cpus();
1758
1759     pstate_linux_deinit();
1760
1761     pstate_user_teardown();
1762
1763     pstate_proc_teardown();
1764
1765     // release pstate control if we have it, and we need to do this on each processor
1766     for (cpu=0;cpu<numcpus;cpu++) { 
1767         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1768     }
1769
1770
1771     // Free any mapping table we built for Intel
1772     if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { 
1773         palacios_free(intel_pstate_to_ctrl);
1774     }
1775
1776
1777     return 0;
1778 }
1779
1780
1781 static struct linux_ext pstate_ext = {
1782     .name = "PSTATE_CTRL",
1783     .init = pstate_ctrl_init,
1784     .deinit = pstate_ctrl_deinit,
1785     .guest_init = NULL,
1786     .guest_deinit = NULL,
1787 };
1788
1789
1790 register_extension(&pstate_ext);
1791
1792
1793