Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


Linux kernel compatibility enhancements (through 3.19)
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
31 #include <asm/msr.h>
32 #include <asm/msr-index.h>
33
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
37
38 #include <interfaces/vmm_pstate_ctrl.h>
39
40 #include "palacios.h"
41 #include "iface-pstate-ctrl.h"
42
43 #include "linux-exts.h"
44
45 /*
46    This P-STATE control implementation includes the following modes.
47    You can switch between modes at any time.
48
49    - Internal control of processor states in Palacios (handoff from Linux)
50      When Palacios acquires this control, this module disables Linux cpufreq control
51      and allows code within Palacios unfettered access to the DVFS hardware. 
52    - Direct control of Intel and AMD processor pstates using code in this module
53      When you acquire this control, this module disables Linux cpufreq control
54      and directly programs the processor itself in response to your requests
55    - External control of processor states via Linux 
56      When you acquire this control, this module uses the Linux cpufreq control
57      to program the processor on your behalf
58    - Host control of processor states
59      This is the normal mode of DVFS control (e.g., Linux cpufreq)
60
61    Additionally, it provides a user-space interface for manipulating
62    p-state regardless of the host's functionality.  This includes
63    an ioctl for commanding the implementation and a /proc file for 
64    showing current status and capabilities.  From user space, you can
65    use the Direct, External, and Host modes.  
66
67    What we mean by "p-state" here is the processor's internal
68    configuration.   For AMD, this is defined as being the same as
69    the ACPI-defined p-state.  For Intel, it is not.  There, it is the 
70    contents of the perf ctl MSR, which is opaque.   We try hard to 
71    provide "p-states" that go from 0...max, by analogy or equivalence
72    to the ACPI p-states. 
73
74 */
75
76
77 #define PALACIOS_GOVNAME "v3vee"
78 #define MAX_PATH_LEN     128
79 #define MAX_GOV_NAME_LEN 16
80
81
/*
 * Per-core P-state bookkeeping.  One instance of this structure is kept
 * for every CPU.
 */
struct pstate_core_info {
    /*
     * Current control mode.  Host control is defined here; the other
     * modes come from the Palacios interface:
     *   V3_PSTATE_EXTERNAL_CONTROL
     *   V3_PSTATE_DIRECT_CONTROL
     *   V3_PSTATE_INTERNAL_CONTROL
     */
#define V3_PSTATE_HOST_CONTROL 0
    uint32_t mode;

    /* Meaningful while in DIRECT mode */
    uint64_t cur_pstate;
    uint64_t max_pstate;
    uint64_t min_pstate;

    uint64_t cur_hw_pstate;

    /* Meaningful while in EXTERNAL mode */
    uint64_t set_freq_khz;   /* the frequency we asked for */
    uint64_t cur_freq_khz;
    uint64_t max_freq_khz;
    uint64_t min_freq_khz;

    /* Intel only */
    uint8_t prior_speedstep;
    uint8_t turbo_disabled;
    uint8_t no_turbo;

    int have_cpufreq;

    /* Linux's governor name, stashed across a mode switch */
    char * linux_governor;
    /* Frequency in effect when we started, kept so it can be restored */
    uint64_t original_hz;

};
117
118
119 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
120
121
122
/*
 * Architecture-specific hooks used when this module has DIRECT control
 * over the core P-states.
 */
struct pstate_core_funcs {
    void     (*arch_init)(void);
    void     (*arch_deinit)(void);
    uint64_t (*get_min_pstate)(void);
    uint64_t (*get_max_pstate)(void);
    uint64_t (*get_pstate)(void);
    void     (*set_pstate)(uint64_t pstate);
};
132
/*
 * Machine-wide description of the DVFS features found by the
 * vendor-specific probes in this file.
 */
struct pstate_machine_info {
    enum {INTEL, AMD, OTHER } arch;
    int supports_pstates;

    /* AMD feature flags */
    int have_pstate;
    int have_coreboost;
    int have_feedback;

    /* Intel feature flags */
    int have_speedstep;
    int have_opportunistic; /* "Turbo Boost" or "IDA" */
    int have_policy_hint;
    int have_hwp;           /* hardware-controlled performance states */
    int have_hdc;           /* hardware duty cycling */
    int have_mwait_ext;     /* mwait power extensions */
    int have_mwait_int;     /* mwait wakes on interrupt */

    /* Common to both vendors */
    int have_pstate_hw_coord;  /* mperf/aperf */

    /* Hooks used for DIRECT control */
    struct pstate_core_funcs *funcs;

};
159
160 static struct pstate_machine_info machine_state;
161
162
163 /****************************************************
164   AMD  DIRECT CONTROL
165  ***************************************************/
166
167 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
168 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
169 #define MSR_PSTATE_CTL_REG_AMD   0xc0010062
170 #define MSR_PSTATE_STAT_REG_AMD  0xc0010063
171
/*
 * Overlay for the value read from MSR_PSTATE_LIMIT_REG_AMD: the raw
 * 64-bit value in `val`, decoded fields in `reg`.
 */
struct p_state_limit_reg_amd {
    union {
        uint64_t val;
        struct {
            /* lowest P-state value (highest perf.) currently supported; may change at runtime */
            uint8_t  pstate_limit : 4;
            /* highest P-state value supported (lowest perf) */
            uint8_t  pstate_max   : 4;
            uint64_t rsvd         : 56;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
182
183
/*
 * Overlay for the value read from MSR_PSTATE_STAT_REG_AMD; the low
 * four bits carry the current P-state.
 */
struct p_state_stat_reg_amd {
    union {
        uint64_t val;
        struct {
            uint8_t  pstate : 4;
            uint64_t rsvd   : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
193
194
/*
 * Overlay for the value written to MSR_PSTATE_CTL_REG_AMD; the low
 * four bits carry the requested P-state.
 */
struct p_state_ctl_reg_amd {
    union {
        uint64_t val;
        struct {
            uint8_t  cmd  : 4;
            uint64_t rsvd : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
204
205
206 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
207 static uint8_t supports_pstates_amd (void)
208 {
209     int i;
210     int mapwrong=0;
211     int amd_num_pstates;
212
213     uint32_t eax, ebx, ecx, edx;
214
215     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
216     machine_state.have_pstate = !!(edx & (1 << 7));
217     machine_state.have_coreboost = !!(edx & (1<<9));
218     machine_state.have_feedback = !!(edx & (1<<11));
219
220     cpuid(0x6, &eax, &ebx, &ecx, &edx);
221     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
222
223     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
224             machine_state.have_pstate, 
225             machine_state.have_coreboost, 
226             machine_state.have_feedback,
227             machine_state.have_pstate_hw_coord);
228
229     amd_num_pstates = get_cpu_var(processors)->performance->state_count;
230     if (amd_num_pstates) { 
231         for (i=0;i<amd_num_pstates;i++) { 
232             INFO("P-State: %u: freq=%llu ctrl=%llx%s\n",
233                  i, 
234                  get_cpu_var(processors)->performance->states[i].core_frequency*1000,
235                  get_cpu_var(processors)->performance->states[i].control,
236                  get_cpu_var(processors)->performance->states[i].control != i ? (mapwrong=1, " ALERT - CTRL MAPPING NOT 1:1") : "");
237         }
238     }
239     if (mapwrong) { 
240         ERROR("P-State: AMD: mapping of pstate and control is not 1:1 on this processor - we will probably not work corrrectly\n");
241     }
242
243     return machine_state.have_pstate;
244
245
246 }
247
248
/* AMD needs no per-core setup before direct control (KCH). */
static void init_arch_amd(void)
{
}
253
254
/* AMD needs no per-core teardown after direct control (KCH). */
static void deinit_arch_amd(void)
{
}
259
260
261 static uint64_t get_pstate_amd(void) 
262 {
263     struct p_state_stat_reg_amd pstat;
264
265     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
266
267     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
268     put_cpu_var(core_state);
269
270     return pstat.reg.pstate;
271 }
272
273
274 static void set_pstate_amd(uint64_t p)
275 {
276     struct p_state_ctl_reg_amd pctl;
277
278     if (p>get_cpu_var(core_state).max_pstate) { 
279         p=get_cpu_var(core_state).max_pstate;
280     }
281     put_cpu_var(core_state);
282
283     pctl.val = 0;
284     pctl.reg.cmd = p;
285
286     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
287
288     get_cpu_var(core_state).cur_pstate=p;
289     put_cpu_var(core_state);
290 }
291
292
293 /*
294  * NOTE: HW may change this value at runtime
295  */
296 static uint64_t get_max_pstate_amd(void)
297 {
298     struct p_state_limit_reg_amd plimits;
299
300     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
301
302     return plimits.reg.pstate_max;
303 }
304
305
306 static uint64_t get_min_pstate_amd(void)
307 {
308     struct p_state_limit_reg_amd plimits;
309
310     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
311
312     return plimits.reg.pstate_limit;
313 }
314
315
316 static struct pstate_core_funcs amd_funcs =
317 {
318     .arch_init        = init_arch_amd,
319     .arch_deinit      = deinit_arch_amd,
320     .get_pstate       = get_pstate_amd,
321     .set_pstate       = set_pstate_amd,
322     .get_max_pstate   = get_max_pstate_amd,
323     .get_min_pstate   = get_min_pstate_amd,
324 };
325
326
327
328 /***********************************************************
329   INTEL DIRECT CONTROL
330  **********************************************************/
331
332
333 /*
334    This implementation uses SpeedStep, but does check
335    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
336    are available.
337 */
338
339 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
340 #define MSR_MPERF_IA32         0x000000e7
341 #define MSR_APERF_IA32         0x000000e8
342 #define MSR_MISC_ENABLE_IA32   0x000001a0
343 #define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
344 #define MSR_PLATFORM_INFO_IA32 0x000000ce
345 #define MSR_PERF_CTL_IA32      0x00000199
346 #define MSR_PERF_STAT_IA32     0x00000198
347 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
348
349
/*
 * Overlay for MSR_PERF_CTL_IA32 (read/write).
 *
 * Unlike AMD, the meaning of the "pstate" field in the Intel control
 * and status registers is implementation dependent.  The "official"
 * way to obtain the mapping from P-state to register value is ACPI;
 * what gets written is the "id" of an operating point.
 *
 * "Often" the 16-bit field is frequency_id : voltage_id, with the high
 * byte being the frequency (typically the multiplier) and the low byte
 * the voltage.
 */
struct perf_ctl_reg_intel {
    union {
        uint64_t val;
        struct {
            /* target operating point; opaque, NOT the ACPI P-state */
            uint16_t pstate                 : 16;
            uint16_t reserved               : 16;
            /*
             * set to 1 to *disengage* dynamic acceleration
             * ("IDA" and "Turbo" share this interface)
             */
            uint16_t dynamic_accel_disable  : 1;
            uint32_t reserved2              : 31;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
381
/* Overlay for MSR_PERF_STAT_IA32 (read only). */
struct perf_stat_reg_intel {
    union {
        uint64_t val;
        struct {
            /* the current operating point */
            uint16_t pstate   : 16;
            uint64_t reserved : 48;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
393
/*
 * Overlay for the energy/performance bias MSR (read/write); the
 * register number is spelled MSR_ENERY_PERF_BIAS_IA32 in this file.
 */
struct enery_perf_bias_reg_intel {
    union {
        uint64_t val;
        struct {
            /* the current policy hint */
            uint8_t  policy_hint : 4;
            uint64_t reserved    : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
405
/*
 * Overlay for MSR_PLATFORM_INFO.
 *
 * Every bit-field is declared with the 64-bit base type so that all of
 * them share the single 64-bit storage unit that `val` occupies.  With
 * the original mix of uint8_t/uint16_t base types the inner struct is
 * not itself packed, so a field that would straddle its own storage
 * unit (rsvd3, min_ratio, rsvd4) is moved to the next unit: min_ratio
 * then no longer sits at bits 47:40 and the overlay grows past 8 bytes.
 *
 * Bit positions (from the field widths below):
 *    7:0   rsvd0
 *   15:8   max_noturbo_ratio
 *   22:16  rsvd1
 *   23     ppin_cap
 *   27:24  rsvd2
 *   28     ratio_limit
 *   29     tdc_tdp_limit
 *   39:30  rsvd3
 *   47:40  min_ratio
 *   63:48  rsvd4
 */
struct turbo_mode_info_reg_intel {
    union {
        uint64_t val;
        struct {
            uint64_t rsvd0                  : 8;
            uint64_t max_noturbo_ratio      : 8;
            uint64_t rsvd1                  : 7;
            uint64_t ppin_cap               : 1;
            uint64_t rsvd2                  : 4;
            uint64_t ratio_limit            : 1;
            uint64_t tdc_tdp_limit          : 1;
            uint64_t rsvd3                  : 10;
            uint64_t min_ratio              : 8;
            uint64_t rsvd4                  : 16;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
424
/*
 * One row of the Intel P-state table.  It replicates the critical
 * information in Linux's struct acpi_processor_px, to make it easier
 * to port to other OSes.
 */
struct intel_pstate_info {
    uint64_t freq;  /* KHz */
    uint64_t ctrl;  /* value to write into the _CTL MSR to get this */
};
431
432 // The internal array will be used if we cannot build the table locally
433 static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0;
434 static int intel_num_pstates_internal=0;
435
436 // These will either point to the internal array or to a constructed array
437 static struct intel_pstate_info *intel_pstate_to_ctrl=0;
438 static int intel_num_pstates=0;
439
440
441 /* CPUID.01:ECX.AES(7) */
442 static uint8_t supports_pstates_intel(void)
443 {
444     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
445     */
446     uint32_t eax, ebx, ecx, edx;
447
448     cpuid(0x1, &eax, &ebx, &ecx, &edx);
449     machine_state.have_speedstep =  !!(ecx & (1 << 7));
450
451     cpuid(0x6, &eax, &ebx, &ecx, &edx);
452     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
453     machine_state.have_opportunistic =  !!(eax & 1<<1);
454     machine_state.have_policy_hint = !!(ecx & 1<<3);
455     machine_state.have_hwp = !!(eax & 1<<7);
456     machine_state.have_hdc = !!(eax & 1<<13);
457
458     cpuid(0x5, &eax, &ebx, &ecx, &edx);
459     machine_state.have_mwait_ext =  !!(ecx & 1);
460     machine_state.have_mwait_int =  !!(ecx & 1<<1);
461
462
463     // Note we test all the available hardware features documented as of August 2014
464     // We are only currently using speed_step, however.
465
466     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
467             machine_state.have_speedstep, 
468             machine_state.have_pstate_hw_coord, 
469             machine_state.have_opportunistic,
470             machine_state.have_policy_hint,
471             machine_state.have_hwp,
472             machine_state.have_hdc,
473             machine_state.have_mwait_ext,
474             machine_state.have_mwait_int );
475
476
477     if (machine_state.have_speedstep) {
478         uint32_t i;
479         // Build mapping table (from "pstate" (0..) to ctrl value for MSR
480         if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { 
481             put_cpu_var(processors);
482             // no acpi...  revert to internal table
483             intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
484             intel_num_pstates=intel_num_pstates_internal;
485         } else {
486             intel_num_pstates = get_cpu_var(processors)->performance->state_count;
487             if (intel_num_pstates) { 
488                 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
489                 if (!intel_pstate_to_ctrl) { 
490                     ERROR("P-State: Cannot allocate space for mapping...\n");
491                     intel_num_pstates=0;
492                 }
493                 for (i=0;i<intel_num_pstates;i++) { 
494                     intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
495                     intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
496                 }
497                     
498             } else {
499                 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
500             }
501         }
502         put_cpu_var(processors);
503         INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
504         for (i=0;i<intel_num_pstates;i++) {
505             INFO("P-State: Intel Mapping %u:  freq=%llu  ctrl=%llx\n",
506                  i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
507         }
508     } else {
509         INFO("P-State: Intel:  No speedstep here\n");
510     }
511         
512
513     return machine_state.have_speedstep;
514 }
515
516
517 static void init_arch_intel(void)
518 {
519     uint64_t val;
520
521     rdmsrl(MSR_MISC_ENABLE_IA32, val);
522
523     //INFO("P-State: prior ENABLE=%llx\n",val);
524
525     // store prior speedstep setting
526     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
527     put_cpu_var(core_state);
528
529     // enable speedstep (probably already on)
530     val |= 1 << 16;
531     wrmsrl(MSR_MISC_ENABLE_IA32, val);
532
533     //INFO("P-State: write ENABLE=%llx\n",val);
534
535 }
536
537 static void deinit_arch_intel(void)
538 {
539     uint64_t val;
540
541     rdmsrl(MSR_MISC_ENABLE_IA32, val);
542
543     //INFO("P-State: deinit: ENABLE=%llx\n",val);
544
545     val &= ~(1ULL << 16);
546     val |= get_cpu_var(core_state).prior_speedstep << 16;
547     put_cpu_var(core_state);
548
549     wrmsrl(MSR_MISC_ENABLE_IA32, val);
550
551     //INFO("P-state: deinit ENABLE=%llx\n",val);
552
553 }
554
555 /* TODO: Intel P-states require sampling at intervals... */
556 static uint64_t get_pstate_intel(void)
557 {
558     uint64_t val;
559
560     rdmsrl(MSR_PERF_STAT_IA32,val);
561
562     //INFO("P-State: Get: 0x%llx\n", val);
563
564     // should check if turbo is active, in which case 
565     // this value is not the whole story
566
567     return val;
568 }
569
570 static void set_pstate_intel(uint64_t p)
571 {
572     uint64_t val;
573     uint64_t ctrl;
574
575     if (intel_num_pstates==0) { 
576         return ;
577     } else {
578         if (p>=intel_num_pstates) { 
579             p=intel_num_pstates-1;
580         }
581     }
582
583     ctrl=intel_pstate_to_ctrl[p].ctrl;
584
585     /* ...Intel IDA (dynamic acceleration)
586        if (c->no_turbo && !c->turbo_disabled) {
587        val |= 1 << 32;
588        }
589        */
590     // leave all bits along expect for the likely
591     // fid bits
592
593     rdmsrl(MSR_PERF_CTL_IA32, val);
594     //INFO("P-State: Pre-Set: 0x%llx\n", val);
595
596     val &= ~0xffffULL;
597     val |= ctrl & 0xffffULL;
598
599     //INFO("P-State: Set: 0x%llx\n", val);
600
601     wrmsrl(MSR_PERF_CTL_IA32, val);
602
603     get_cpu_var(core_state).cur_pstate = p;
604     put_cpu_var(core_state);
605 }
606
607
/* The lowest Intel P-state index is always 0. */
static uint64_t get_min_pstate_intel(void)
{
    const uint64_t lowest = 0;

    return lowest;
}
612
613
614
615 static uint64_t get_max_pstate_intel (void)
616 {
617     if (intel_num_pstates==0) { 
618         return 0;
619     } else {
620         return intel_num_pstates-1;
621     }
622 }
623
624 static struct pstate_core_funcs intel_funcs =
625 {
626     .arch_init        = init_arch_intel,
627     .arch_deinit      = deinit_arch_intel,
628     .get_pstate       = get_pstate_intel,
629     .set_pstate       = set_pstate_intel,
630     .get_max_pstate   = get_max_pstate_intel,
631     .get_min_pstate   = get_min_pstate_intel,
632 };
633
634
635
636 /***********************************************
637   Arch determination and setup
638  ***********************************************/
639
640 static inline void cpuid_string (uint32_t id, uint32_t dest[4]) 
641 {
642     asm volatile("cpuid"
643             :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
644             :"a"(id));
645 }
646
647
/*
 * Fetch the CPU vendor string via CPUID leaf 0.
 *
 * @name: receives the 12-character vendor id followed by a NUL.  The
 *        string is assembled from EBX, EDX, ECX, in that order.
 *
 * Returns the EAX value reported by leaf 0.
 *
 * The register words are copied with memcpy() rather than stored
 * through a char* cast to uint32_t*, so the code does not depend on
 * the buffer's alignment or on relaxed aliasing rules.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t regs[4];

    cpuid_string(0, regs);

    memcpy(name,     &regs[1], sizeof(regs[1]));   /* EBX */
    memcpy(name + 4, &regs[3], sizeof(regs[3]));   /* EDX */
    memcpy(name + 8, &regs[2], sizeof(regs[2]));   /* ECX */
    name[12] = 0;

    return regs[0];
}
662
663
/* Non-zero when the CPUID vendor string is "GenuineIntel". */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor,"GenuineIntel") == 0;
}
670
671
/* Non-zero when the CPUID vendor string is "AuthenticAMD". */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor,"AuthenticAMD") == 0;
}
678
679 static int pstate_arch_setup(void)
680 {
681
682     if (is_amd()) {
683         machine_state.arch = AMD;
684         machine_state.funcs = &amd_funcs;
685         machine_state.supports_pstates = supports_pstates_amd();
686         INFO("PSTATE: P-State initialized for AMD\n");
687     } else if (is_intel()) {
688         machine_state.arch  = INTEL;
689         machine_state.funcs = &intel_funcs;
690         machine_state.supports_pstates = supports_pstates_intel();
691         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
692         return 0;
693
694     } else {
695         machine_state.arch = OTHER;
696         machine_state.funcs = NULL;
697         machine_state.supports_pstates = 0;
698         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
699         return 0;
700     }
701
702     return 0;
703 }
704
705
706
707 /******************************************************************
708   Linux Interface
709  *****************************************************************/
710
711 static unsigned cpus_using_v3_governor;
712 static DEFINE_MUTEX(v3_governor_mutex);
713
714 /* KCH: this will tell us when there is an actual frequency transition */
715 static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
716         void *data)
717 {
718     struct cpufreq_freqs *freq = data;
719
720     if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) {
721         return 0;
722     }
723
724     if (val == CPUFREQ_POSTCHANGE) {
725         DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
726                 freq->cpu, freq->new);
727         per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new;
728     }
729
730     return 0;
731
732 }
733
734
735 static struct notifier_block v3_cpufreq_notifier_block = {
736     .notifier_call = v3_cpufreq_notifier
737 };
738
739
740 /* 
741  * This stub governor is simply a placeholder for preventing 
742  * frequency changes from the Linux side. For now, we simply leave
743  * the frequency as is when we acquire control. 
744  */
745 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
746 {
747     unsigned cpu = policy->cpu;
748
749     switch (event) {
750         /* we can't use cpufreq_driver_target here as it can result
751          * in a circular dependency, so we'll keep the current frequency as is
752          */
753         case CPUFREQ_GOV_START:
754             BUG_ON(!policy->cur);
755
756             mutex_lock(&v3_governor_mutex);
757
758             if (cpus_using_v3_governor == 0) {
759                 cpufreq_register_notifier(&v3_cpufreq_notifier_block,
760                         CPUFREQ_TRANSITION_NOTIFIER);
761             }
762
763             cpus_using_v3_governor++;
764
765             per_cpu(core_state, cpu).set_freq_khz = policy->cur;
766             per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
767             per_cpu(core_state, cpu).max_freq_khz = policy->max;
768             per_cpu(core_state, cpu).min_freq_khz = policy->min;
769
770             mutex_unlock(&v3_governor_mutex);
771             break;
772         case CPUFREQ_GOV_STOP:
773             mutex_lock(&v3_governor_mutex);
774
775             cpus_using_v3_governor--;
776
777             if (cpus_using_v3_governor == 0) {
778                 cpufreq_unregister_notifier(
779                         &v3_cpufreq_notifier_block,
780                         CPUFREQ_TRANSITION_NOTIFIER);
781             }
782
783             per_cpu(core_state, cpu).set_freq_khz = 0;
784             per_cpu(core_state, cpu).cur_freq_khz = 0;
785             per_cpu(core_state, cpu).max_freq_khz = 0;
786             per_cpu(core_state, cpu).min_freq_khz = 0;
787
788             mutex_unlock(&v3_governor_mutex);
789             break;
790         case CPUFREQ_GOV_LIMITS:
791             /* do nothing */
792             break;
793         default:
794             ERROR("Undefined governor command (%u)\n", event);
795             return -1;
796     }                           
797
798     return 0;
799 }
800
801
802 static struct cpufreq_governor stub_governor = 
803 {
804     .name = PALACIOS_GOVNAME,
805     .governor = governor_run,
806     .owner = THIS_MODULE,
807 };
808
809
810 static struct workqueue_struct *pstate_wq;
811
812 typedef struct {
813     struct work_struct work;
814     uint64_t freq;
815 } pstate_work_t;
816
817
818
819 static inline void pstate_register_linux_governor(void)
820 {
821     cpufreq_register_governor(&stub_governor);
822 }
823
824
825 static inline void pstate_unregister_linux_governor(void)
826 {
827     cpufreq_unregister_governor(&stub_governor);
828 }
829
830
831 static int pstate_linux_init(void)
832 {
833     pstate_register_linux_governor();
834     pstate_wq = create_workqueue("v3vee_pstate_wq");
835     if (!pstate_wq) {
836         ERROR("Could not create work queue\n");
837         goto out_err;
838     }
839
840     return 0;
841
842 out_err:
843     pstate_unregister_linux_governor();
844     return -1;
845 }
846
847
848 static void pstate_linux_deinit(void)
849 {
850     pstate_unregister_linux_governor();
851     flush_workqueue(pstate_wq);
852     destroy_workqueue(pstate_wq);
853 }
854
855
856 static int get_current_governor(char **buf, unsigned int cpu)
857 {
858     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
859     char * govname = NULL;
860
861     if (!policy) {
862         ERROR("could not allocate cpufreq_policy\n");
863         return -1;
864     }
865         
866     if (cpufreq_get_policy(policy, cpu) != 0) {
867         ERROR("Could not get current cpufreq policy\n");
868         goto out_err;
869     }
870
871     /* We're in interrupt context, should probably not wait here */
872     govname = palacios_alloc(MAX_GOV_NAME_LEN);
873     if (!govname) {
874         ERROR("Could not allocate space for governor name\n");
875         goto out_err;
876     }
877
878     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
879     govname[MAX_GOV_NAME_LEN-1] = 0;
880
881     get_cpu_var(core_state).linux_governor = govname;
882     put_cpu_var(core_state);
883
884     *buf = govname;
885
886     palacios_free(policy);
887
888     return 0;
889
890 out_err:
891     palacios_free(policy);
892     return -1;
893 }
894
895
896 /* passed to the userspacehelper interface for cleanup */
897 static void gov_switch_cleanup(struct subprocess_info * s)
898 {
899     palacios_free(s->argv[2]);
900     palacios_free(s->argv);
901 }
902
903
904 /* 
905  * Switch governors
906  * @s - the governor to switch to 
907  * TODO: this should probably be submitted to a work queue
908  * so we don't have to run it in interrupt context
909  */
910 static int governor_switch(char * s, unsigned int cpu)
911 {
912     char * path_str = NULL;
913     char ** argv = NULL; 
914
915     static char * envp[] = {
916         "HOME=/",
917         "TERM=linux",
918         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
919
920
921     argv = palacios_alloc(4*sizeof(char*));
922     if (!argv) {
923         ERROR("Couldn't allocate argv struct\n");
924         return -1;
925     }
926
927     path_str = palacios_alloc(MAX_PATH_LEN);
928     if (!path_str) {
929         ERROR("Couldn't allocate path string\n");
930         goto out_freeargv;
931     }
932     memset(path_str, 0, MAX_PATH_LEN);
933
934     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
935
936     argv[0] = "/bin/sh";
937     argv[1] = "-c";
938     argv[2] = path_str;
939     argv[3] = NULL;
940
941     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
942
943 #if LINUX_VERSION_CODE <= KERNEL_VERSION(3,9,0)
944     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
945 #else
946     {
947       struct subprocess_info *sp;
948       
949       sp = call_usermodehelper_setup("/bin/sh", argv, envp, GFP_ATOMIC, NULL, gov_switch_cleanup, NULL);
950       if (!sp) { 
951         goto out_freeargv;
952       }
953       
954       return call_usermodehelper_exec(sp,0);
955     }
956 #endif
957       
958 out_freeargv:
959     palacios_free(argv);
960     return -1;
961 }
962
963
964 static inline void free_linux_governor(void)
965 {
966     palacios_free(get_cpu_var(core_state).linux_governor);
967     put_cpu_var(core_state);
968 }
969
970
971 static int linux_setup_palacios_governor(void)
972 {
973     char * gov;
974     unsigned int cpu = get_cpu();
975     put_cpu();
976
977     /* KCH:  we assume the v3vee governor is already 
978      * registered with kernel by this point 
979      */
980
981     if (get_current_governor(&gov, cpu) < 0) {
982         ERROR("Could not get current governor\n");
983         return -1;
984     }
985
986     DEBUG("saving current governor (%s)\n", gov);
987
988     get_cpu_var(core_state).linux_governor = gov;
989     put_cpu_var(core_state);
990     
991     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
992
993     /* set the new one to ours */
994
995     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
996         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
997         return -1;
998     }
999
1000     return 0;
1001 }
1002
1003
1004
1005 static uint64_t linux_get_pstate(void)
1006 {
1007     struct cpufreq_policy * policy = NULL;
1008     struct cpufreq_frequency_table *table;
1009     unsigned int i = 0;
1010     unsigned int count = 0;
1011     unsigned int cpu = get_cpu(); 
1012     put_cpu();
1013
1014
1015     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1016     if (!policy) {
1017         ERROR("Could not allocate policy struct\n");
1018         return -1;
1019     }
1020
1021     cpufreq_get_policy(policy, cpu);
1022     table = cpufreq_frequency_get_table(cpu);
1023
1024     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1025
1026         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1027             continue;
1028         }
1029
1030         if (table[i].frequency == policy->cur) {
1031             break;
1032         }
1033
1034         count++;
1035     }
1036
1037     palacios_free(policy);
1038
1039     put_cpu();
1040     return count;
1041 }
1042
1043
1044 static uint64_t linux_get_freq(void)
1045 {
1046     uint64_t freq;
1047     struct cpufreq_policy * policy = NULL;
1048     unsigned int cpu = get_cpu();
1049     put_cpu();
1050
1051     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1052     if (!policy) {
1053         ERROR("Could not allocate policy struct\n");
1054         return -1;
1055     }
1056
1057     if (cpufreq_get_policy(policy, cpu)) {
1058         ERROR("Could not get current policy\n");
1059         return -1;
1060     }
1061
1062     freq=policy->cur;
1063
1064     palacios_free(policy);
1065
1066     return freq;
1067 }
1068
1069 static void  
1070 pstate_switch_workfn (struct work_struct *work)
1071 {
1072     pstate_work_t * pwork = (pstate_work_t*)work;
1073     struct cpufreq_policy * policy = NULL;
1074     uint64_t freq; 
1075     unsigned int cpu = get_cpu();
1076     put_cpu();
1077
1078     mutex_lock(&v3_governor_mutex);
1079
1080     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1081     if (!policy) {
1082         ERROR("Could not allocate space for cpufreq policy\n");
1083         goto out;
1084     }
1085
1086     if (cpufreq_get_policy(policy, cpu) != 0) {
1087         ERROR("Could not get cpufreq policy\n");
1088         goto out1;
1089     }
1090
1091     freq = pwork->freq;
1092     get_cpu_var(core_state).set_freq_khz = freq;
1093
1094     if (freq < get_cpu_var(core_state).min_freq_khz) {
1095         freq = get_cpu_var(core_state).min_freq_khz;
1096     }
1097     if (freq > get_cpu_var(core_state).max_freq_khz) {
1098         freq = get_cpu_var(core_state).max_freq_khz;
1099     }
1100     put_cpu_var(core_state);
1101
1102     INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
1103     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
1104
1105 out1:
1106     palacios_free(policy);
1107 out:
1108     palacios_free(work);
1109     mutex_unlock(&v3_governor_mutex);
1110
1111
1112
1113 static int linux_set_pstate(uint64_t p)
1114 {
1115     struct cpufreq_policy * policy = NULL;
1116     struct cpufreq_frequency_table *table;
1117     pstate_work_t * work = NULL;
1118     unsigned int i = 0;
1119     unsigned int count = 0;
1120     int state_set = 0;
1121     int last_valid = 0;
1122     unsigned int cpu = get_cpu();
1123     put_cpu();
1124
1125     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1126     if (!policy) {
1127         ERROR("Could not allocate policy struct\n");
1128         return -1;
1129     }
1130
1131     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1132     if (!work) {
1133         ERROR("Could not allocate work struct\n");
1134         goto out_err;
1135     }
1136
1137     if (cpufreq_get_policy(policy, cpu)) {
1138         ERROR("Could not get current policy\n");
1139         goto out_err1;
1140     }
1141     table = cpufreq_frequency_get_table(cpu);
1142
1143     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1144
1145         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1146             continue;
1147         }
1148
1149         if (count == p) {
1150
1151             INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1152             work->freq = table[i].frequency;
1153             queue_work(pstate_wq, (struct work_struct*)work);
1154
1155             state_set = 1;
1156             break;
1157         }
1158
1159         count++;
1160         last_valid = i;
1161     }
1162
1163     /* we need to deal with the case in which we get a number > max pstate */
1164     if (!state_set) {
1165         INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1166         work->freq = table[last_valid].frequency;
1167         queue_work(pstate_wq, (struct work_struct*)work);
1168     }
1169
1170     palacios_free(policy);
1171     return 0;
1172
1173 out_err1: 
1174     palacios_free(work);
1175 out_err:
1176     palacios_free(policy);
1177     return -1;
1178 }
1179
1180
1181 static int linux_set_freq(uint64_t f)
1182 {
1183     struct cpufreq_policy * policy = NULL;
1184     pstate_work_t * work = NULL;
1185     uint64_t freq;
1186     unsigned int cpu = get_cpu();
1187     put_cpu();
1188
1189     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1190     if (!policy) {
1191         ERROR("Could not allocate policy struct\n");
1192         return -1;
1193     }
1194
1195     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1196     if (!work) {
1197         ERROR("Could not allocate work struct\n");
1198         goto out_err;
1199     }
1200
1201     if (cpufreq_get_policy(policy, cpu) != 0) {
1202         ERROR("Could not get cpufreq policy\n");
1203         goto out_err1;
1204     }
1205
1206     if (f < policy->min) {
1207         freq = policy->min;
1208     } else if (f > policy->max) {
1209         freq = policy->max;
1210     } else {
1211         freq = f;
1212     }
1213
1214     INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1215     work->freq = freq;
1216     queue_work(pstate_wq, (struct work_struct*)work);
1217
1218     palacios_free(policy);
1219     return 0;
1220
1221 out_err1:
1222     palacios_free(work);
1223 out_err:
1224     palacios_free(policy);
1225     return -1;
1226 }
1227
1228
1229 static int linux_restore_defaults(void)
1230 {
1231     char * gov = NULL;
1232     unsigned int cpu = get_cpu();
1233     put_cpu();
1234
1235     gov = get_cpu_var(core_state).linux_governor;
1236     put_cpu_var(core_state);
1237
1238     DEBUG("restoring previous governor (%s)\n", gov);
1239
1240     if (governor_switch(gov, cpu) < 0) {
1241         ERROR("Could not restore governor to (%s)\n", gov);
1242         goto out_err;
1243     }
1244
1245     free_linux_governor();
1246     return 0;
1247
1248 out_err:
1249     free_linux_governor();
1250     return -1;
1251 }
1252
1253
1254
1255 /******************************************************************
1256   Generic Interface as provided to Palacios and to the rest of the
1257   module
1258  ******************************************************************/
1259
1260 static void init_core(void)
1261 {
1262     unsigned cpu;
1263     struct cpufreq_policy *p;
1264
1265
1266     //DEBUG("P-State Core Init\n");
1267
1268     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1269     get_cpu_var(core_state).cur_pstate = 0;
1270
1271     if (machine_state.funcs) {
1272         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1273         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1274     } else {
1275         get_cpu_var(core_state).min_pstate = 0;
1276         get_cpu_var(core_state).max_pstate = 0;
1277     }
1278
1279
1280     cpu = get_cpu(); put_cpu();
1281
1282     p = cpufreq_cpu_get(cpu);
1283
1284     if (!p) { 
1285         get_cpu_var(core_state).have_cpufreq = 0;
1286         get_cpu_var(core_state).min_freq_khz=0;
1287         get_cpu_var(core_state).max_freq_khz=0;
1288         get_cpu_var(core_state).cur_freq_khz=0;
1289     } else {
1290         get_cpu_var(core_state).have_cpufreq = 1;
1291         get_cpu_var(core_state).min_freq_khz=p->min;
1292         get_cpu_var(core_state).max_freq_khz=p->max;
1293         get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); 
1294     put_cpu_var(core_state);
1295
1296     /*
1297     for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) { 
1298         INFO("P-State: %u: freq=%llu ctrl=%llx",
1299                 i, 
1300                 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1301                 get_cpu_var(processors)->performance->states[i].control);
1302    }
1303    put_cpu_var(processors);
1304     */
1305 }
1306
1307
1308 void palacios_pstate_ctrl_release(void);
1309
1310
/*
 * Per-core teardown: give up whatever P-state control this core holds.
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    palacios_pstate_ctrl_release();
}
1317
1318
1319
1320 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1321 {
1322     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1323
1324
1325     c->features = V3_PSTATE_INTERNAL_CONTROL;
1326
1327     if (get_cpu_var(core_state).have_cpufreq) {
1328         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1329     }
1330
1331     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1332         c->features |= V3_PSTATE_DIRECT_CONTROL;
1333     }
1334     c->cur_mode = get_cpu_var(core_state).mode;
1335     c->min_pstate = get_cpu_var(core_state).min_pstate;
1336     c->max_pstate = get_cpu_var(core_state).max_pstate;
1337     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1338     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1339     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1340     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1341
1342     put_cpu_var(core_state);
1343
1344
1345
1346 }
1347
1348
1349 uint64_t palacios_pstate_ctrl_get_pstate(void)
1350 {
1351     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1352         put_cpu_var(core_state);
1353         return machine_state.funcs->get_pstate();
1354     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1355         put_cpu_var(core_state);
1356         return linux_get_pstate();
1357     } else {
1358         put_cpu_var(core_state);
1359         return 0;
1360     }
1361 }
1362
1363
1364 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1365 {
1366     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1367         put_cpu_var(core_state);
1368         machine_state.funcs->set_pstate(p);
1369     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1370         put_cpu_var(core_state);
1371         linux_set_pstate(p);
1372     } else {
1373         put_cpu_var(core_state);
1374     }
1375 }
1376
1377
1378 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1379 {
1380     palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
1381 }
1382
1383
1384 uint64_t palacios_pstate_ctrl_get_freq(void)
1385 {
1386     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1387         put_cpu_var(core_state);
1388         return linux_get_freq();
1389     } else {
1390         put_cpu_var(core_state);
1391         return 0;
1392     }
1393 }
1394
1395
1396 void palacios_pstate_ctrl_set_freq(uint64_t p)
1397 {
1398     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1399         put_cpu_var(core_state);
1400         linux_set_freq(p);
1401     } else {
1402         put_cpu_var(core_state);
1403     }
1404 }
1405
1406
1407 static int switch_to_external(void)
1408 {
1409     DEBUG("switch from host control to external\n");
1410
1411     if (!(get_cpu_var(core_state).have_cpufreq)) {
1412         put_cpu_var(core_state);
1413         ERROR("No cpufreq  - cannot switch to external...\n");
1414         return -1;
1415     } 
1416     put_cpu_var(core_state);
1417
1418     linux_setup_palacios_governor();
1419
1420     get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1421     put_cpu_var(core_state);
1422
1423     return 0;
1424 }
1425
1426
1427 static int switch_to_direct(void)
1428 {
1429     DEBUG("switch from host control to direct\n");
1430
1431     if (get_cpu_var(core_state).have_cpufreq) { 
1432         put_cpu_var(core_state);
1433         DEBUG("switch to direct from cpufreq\n");
1434
1435         // The implementation would set the policy and governor to peg cpu
1436         // regardless of load
1437         linux_setup_palacios_governor();
1438     } else {
1439         put_cpu_var(core_state);
1440     }
1441
1442     if (machine_state.funcs && machine_state.funcs->arch_init) {
1443         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1444
1445         machine_state.funcs->arch_init();
1446
1447         put_cpu_var(core_state);
1448     }
1449
1450     return 0;
1451 }
1452
1453
1454 static int switch_to_internal(void)
1455 {
1456     DEBUG("switch from host control to internal\n");
1457
1458     if (get_cpu_var(core_state).have_cpufreq) { 
1459         put_cpu_var(core_state);
1460         DEBUG("switch to internal on machine with cpu freq\n");
1461         linux_setup_palacios_governor();
1462     } else {
1463         put_cpu_var(core_state);
1464     }
1465
1466     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1467
1468     put_cpu_var(core_state);
1469
1470     return 0;
1471 }
1472
1473
1474 static int switch_from_external(void)
1475 {
1476     if (!(get_cpu_var(core_state).have_cpufreq)) {
1477         put_cpu_var(core_state);
1478         ERROR("No cpufreq  - how did we get here... external...\n");
1479         return -1;
1480     }
1481     put_cpu_var(core_state);
1482
1483     DEBUG("Switching back to host control from external\n");
1484
1485     if (get_cpu_var(core_state).have_cpufreq) { 
1486         put_cpu_var(core_state);
1487         linux_restore_defaults();
1488     } else {
1489         put_cpu_var(core_state);
1490     }
1491
1492     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1493     put_cpu_var(core_state);
1494
1495     return 0;
1496 }
1497
1498
1499 static int switch_from_direct(void)
1500 {
1501
1502     DEBUG("Switching back to host control from direct\n");
1503
1504     // Set maximum performance, just in case there is no host control
1505     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1506     machine_state.funcs->arch_deinit();
1507
1508     if (get_cpu_var(core_state).have_cpufreq) { 
1509         put_cpu_var(core_state);
1510         linux_restore_defaults();
1511     } else {
1512         put_cpu_var(core_state);
1513     }
1514
1515     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1516
1517     put_cpu_var(core_state);
1518
1519     return 0;
1520 }
1521
1522
1523 static int switch_from_internal(void)
1524 {
1525     DEBUG("Switching back to host control from internal\n");
1526
1527     if (get_cpu_var(core_state).have_cpufreq) { 
1528         put_cpu_var(core_state);
1529         linux_restore_defaults();
1530     } else {
1531         put_cpu_var(core_state);
1532     }
1533
1534     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1535
1536     put_cpu_var(core_state);
1537
1538     return 0;
1539 }
1540
1541
1542
1543 void palacios_pstate_ctrl_acquire(uint32_t type)
1544 {
1545     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1546         put_cpu_var(core_state);
1547         palacios_pstate_ctrl_release();
1548     } else {
1549         put_cpu_var(core_state);
1550     }
1551
1552     switch (type) { 
1553         case V3_PSTATE_EXTERNAL_CONTROL:
1554             switch_to_external();
1555             break;
1556         case V3_PSTATE_DIRECT_CONTROL:
1557             switch_to_direct();
1558             break;
1559         case V3_PSTATE_INTERNAL_CONTROL:
1560             switch_to_internal();
1561             break;
1562         default:
1563             ERROR("Unknown pstate control type %u\n",type);
1564             break;
1565     }
1566
1567 }
1568
1569 // Wrappers for xcalls
1570 static void palacios_pstate_ctrl_acquire_external(void)
1571 {
1572     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1573 }
1574
1575 static void palacios_pstate_ctrl_acquire_direct(void)
1576 {
1577     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1578 }
1579
1580
1581 void palacios_pstate_ctrl_release(void)
1582 {
1583     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1584         put_cpu_var(core_state);
1585         return;
1586     } 
1587     put_cpu_var(core_state);
1588
1589     switch (get_cpu_var(core_state).mode) { 
1590         case V3_PSTATE_EXTERNAL_CONTROL:
1591             put_cpu_var(core_state);
1592             switch_from_external();
1593             break;
1594         case V3_PSTATE_DIRECT_CONTROL:
1595             put_cpu_var(core_state);
1596             switch_from_direct();
1597             break;
1598         case V3_PSTATE_INTERNAL_CONTROL:
1599             put_cpu_var(core_state);
1600             switch_from_internal();
1601             break;
1602         default:
1603             put_cpu_var(core_state);
1604             ERROR("Unknown pstate control type %u\n",core_state.mode);
1605             break;
1606     }
1607 }
1608
1609
1610 static void update_hw_pstate(void *arg)
1611 {
1612     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1613         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1614         put_cpu_var(core_state);
1615     } else {
1616         get_cpu_var(core_state).cur_hw_pstate = 0;
1617         put_cpu_var(core_state);
1618     }
1619 }
1620
1621
1622 /***************************************************************************
1623   PROC Interface to expose state
1624  ***************************************************************************/
1625
1626 static int pstate_show(struct seq_file * file, void * v)
1627 {
1628     unsigned int cpu;
1629     unsigned int numcpus = num_online_cpus();
1630
1631     seq_printf(file, "V3VEE DVFS Status\n\n");
1632
1633     for (cpu=0;cpu<numcpus;cpu++) { 
1634         palacios_xcall(cpu,update_hw_pstate,0);
1635     }
1636
1637     for (cpu=0;cpu<numcpus;cpu++) { 
1638         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1639         seq_printf(file,"pcore %u: hw pstate 0x%llx mode %s ",cpu,
1640                 s->cur_hw_pstate,
1641                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1642                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1643                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1644                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1645         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1646             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1647         } 
1648         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1649             seq_printf(file,"(min=%llu max=%llu cur=%llu) ",s->min_pstate, s->max_pstate, s->cur_pstate);
1650         }
1651         seq_printf(file,"\n");
1652     }
1653     return 0;
1654 }
1655
1656 static int pstate_open(struct inode * inode, struct file * file) 
1657 {
1658     return single_open(file, pstate_show, NULL);
1659 }
1660
1661
1662 static struct file_operations pstate_fops = {
1663     .owner = THIS_MODULE,
1664     .open = pstate_open, 
1665     .read = seq_read,
1666     .llseek = seq_lseek,
1667     .release = seq_release
1668 };
1669
1670 static int pstate_hw_show(struct seq_file * file, void * v)
1671 {
1672     int numstates;
1673
1674     seq_printf(file, "V3VEE DVFS Hardware Info\n(all logical cores assumed identical)\n\n");
1675
1676     seq_printf(file, "Arch:   \t%s\n"
1677                      "PStates:\t%s\n\n",
1678             machine_state.arch==INTEL ? "Intel" : 
1679             machine_state.arch==AMD ? "AMD" : "Other",
1680             machine_state.supports_pstates ? "Yes" : "No");
1681
1682
1683 #define YN(x) ((x) ? "Y" : "N")
1684
1685     if (machine_state.arch==INTEL) {
1686         seq_printf(file,"SpeedStep:           \t%s\n",YN(machine_state.have_speedstep));
1687         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1688         seq_printf(file,"IDA or TurboCore:    \t%s\n",YN(machine_state.have_opportunistic));
1689         seq_printf(file,"Policy Hint:         \t%s\n",YN(machine_state.have_policy_hint));
1690         seq_printf(file,"Hardware Policy:     \t%s\n",YN(machine_state.have_hwp));
1691         seq_printf(file,"Hardware Duty Cycle: \t%s\n",YN(machine_state.have_hdc));
1692         seq_printf(file,"MWAIT extensions:    \t%s\n",YN(machine_state.have_mwait_ext));
1693         seq_printf(file,"MWAIT wake on intr:  \t%s\n",YN(machine_state.have_mwait_int));
1694     } 
1695
1696     if (machine_state.arch==AMD) { 
1697         seq_printf(file,"PState:              \t%s\n",YN(machine_state.have_pstate));
1698         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1699         seq_printf(file,"CoreBoost:           \t%s\n",YN(machine_state.have_coreboost));
1700         seq_printf(file,"Feedback:            \t%s\n",YN(machine_state.have_feedback));
1701     }
1702
1703
1704     seq_printf(file,"\nPstate\tCtrl\tKHz\tmW\tuS(X)\tuS(B)\n");
1705     numstates = get_cpu_var(processors)->performance->state_count;
1706     if (!numstates) { 
1707         seq_printf(file,"UNKNOWN\n");
1708     } else {
1709         int i;
1710         for (i=0;i<numstates;i++) { 
1711             seq_printf(file,
1712                        "%u\t%llx\t%llu\t%llu\t%llu\t%llu\n",
1713                        i, 
1714                        get_cpu_var(processors)->performance->states[i].control,
1715                        get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1716                        get_cpu_var(processors)->performance->states[i].power,
1717                        get_cpu_var(processors)->performance->states[i].transition_latency,
1718                        get_cpu_var(processors)->performance->states[i].bus_master_latency);
1719         }
1720     }
1721     put_cpu_var(processors);
1722
1723     seq_printf(file,"\nAvailable Modes:");
1724     seq_printf(file," host");
1725     if (get_cpu_var(core_state).have_cpufreq) { 
1726         seq_printf(file," external");
1727     }
1728     put_cpu_var(core_state);
1729     if (machine_state.supports_pstates) {
1730         seq_printf(file," direct");
1731     }
1732     seq_printf(file," internal\n");
1733
1734     return 0;
1735 }
1736
1737 static int pstate_hw_open(struct inode * inode, struct file * file) 
1738 {
1739     return single_open(file, pstate_hw_show, NULL);
1740 }
1741
1742
1743 static struct file_operations pstate_hw_fops = {
1744     .owner = THIS_MODULE,
1745     .open = pstate_hw_open, 
1746     .read = seq_read,
1747     .llseek = seq_lseek,
1748     .release = seq_release
1749 };
1750
1751
1752 int pstate_proc_setup(void)
1753 {
1754     struct proc_dir_entry *proc;
1755     struct proc_dir_entry *prochw;
1756
1757     PAL_PROC_CREATE(proc,"v3-dvfs",0444,palacios_get_procdir(),&pstate_fops);
1758
1759     if (!proc) { 
1760         ERROR("Failed to create proc entry for p-state control\n");
1761         return -1;
1762     }
1763
1764     INFO("/proc/v3vee/v3-dvfs successfully created\n");
1765
1766     PAL_PROC_CREATE(prochw,"v3-dvfs-hw",0444,palacios_get_procdir(),&pstate_hw_fops);
1767
1768     if (!prochw) { 
1769         ERROR("Failed to create proc entry for p-state hw info\n");
1770         return -1;
1771     }
1772
1773     INFO("/proc/v3vee/v3-dvfs-hw successfully created\n");
1774
1775     return 0;
1776 }
1777
/* Remove both DVFS proc entries, hardware-info entry first. */
void pstate_proc_teardown(void)
{
    static const char * const hw_entry = "v3-dvfs-hw";
    static const char * const status_entry = "v3-dvfs";

    remove_proc_entry(hw_entry,palacios_get_procdir());
    remove_proc_entry(status_entry,palacios_get_procdir());
}
1783
1784 /********************************************************************
1785   User interface (ioctls)
1786  ********************************************************************/
1787
1788 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1789 {
1790     struct v3_dvfs_ctrl_request r;
1791
1792     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1793         ERROR("Failed to copy DVFS request from user\n");
1794         return -EFAULT;
1795     }
1796
1797     if (r.pcore >= num_online_cpus()) {
1798         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1799         return -EFAULT;
1800     }
1801
1802     switch (r.cmd) {
1803         case V3_DVFS_ACQUIRE: {
1804                                   switch (r.acq_type) { 
1805                                       case V3_DVFS_EXTERNAL:
1806                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1807                                           return 0;
1808                                           break;
1809                                       case V3_DVFS_DIRECT:
1810                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1811                                           return 0;
1812                                           break;
1813                                       default:
1814                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1815                                           return -EFAULT;
1816                                   }
1817                               }
1818                               break;
1819         case V3_DVFS_RELEASE: {
1820                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1821                                   return 0;
1822                               }
1823                               break;
1824         case V3_DVFS_SETFREQ: {
1825                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1826                                   return 0;
1827                               }
1828                               break;
1829         case V3_DVFS_SETPSTATE: {
1830                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1831                                     return 0;
1832                                 }
1833         default: {
1834                      ERROR("Unknown DVFS command %u\n",r.cmd);
1835                      return -EFAULT;
1836                  }
1837                  break;
1838     }
1839 }
1840
1841
1842 void pstate_user_setup(void)
1843 {
1844     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1845 }
1846
1847
1848 void pstate_user_teardown(void)
1849 {
1850     remove_global_ctrl(V3_DVFS_CTRL);
1851 }
1852
1853 static struct v3_host_pstate_ctrl_iface hooks = {
1854     .get_chars = palacios_pstate_ctrl_get_chars,
1855     .acquire = palacios_pstate_ctrl_acquire,
1856     .release = palacios_pstate_ctrl_release,
1857     .set_pstate = palacios_pstate_ctrl_set_pstate,
1858     .get_pstate = palacios_pstate_ctrl_get_pstate,
1859     .set_freq = palacios_pstate_ctrl_set_freq,
1860     .get_freq = palacios_pstate_ctrl_get_freq,
1861 };
1862
1863
1864
1865 static int pstate_ctrl_init(void) 
1866 {
1867     unsigned int cpu;
1868     unsigned int numcpus = num_online_cpus();
1869
1870     pstate_arch_setup();
1871
1872     for (cpu=0;cpu<numcpus;cpu++) { 
1873         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1874     }
1875
1876     V3_Init_Pstate_Ctrl(&hooks);  
1877
1878     if (pstate_proc_setup()) { 
1879         ERROR("Unable to initialize P-State Control\n");
1880         return -1;
1881     }
1882
1883     pstate_user_setup();
1884
1885     pstate_linux_init();
1886
1887     INFO("P-State Control Initialized\n");
1888
1889     return 0;
1890 }
1891
1892 static int pstate_ctrl_deinit(void)
1893 {
1894     unsigned int cpu;
1895     unsigned int numcpus=num_online_cpus();
1896
1897     pstate_linux_deinit();
1898
1899     pstate_user_teardown();
1900
1901     pstate_proc_teardown();
1902
1903     // release pstate control if we have it, and we need to do this on each processor
1904     for (cpu=0;cpu<numcpus;cpu++) { 
1905         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1906     }
1907
1908
1909     // Free any mapping table we built for Intel
1910     if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { 
1911         palacios_free(intel_pstate_to_ctrl);
1912     }
1913
1914
1915     return 0;
1916 }
1917
1918
1919 static struct linux_ext pstate_ext = {
1920     .name = "PSTATE_CTRL",
1921     .init = pstate_ctrl_init,
1922     .deinit = pstate_ctrl_deinit,
1923     .guest_init = NULL,
1924     .guest_deinit = NULL,
1925 };
1926
1927
1928 register_extension(&pstate_ext);
1929
1930
1931