Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


2864ea05cc82ae174d1420853088d5bec07c9e87
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
31 #include <asm/msr.h>
32 #include <asm/msr-index.h>
33
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
37
38 #include <interfaces/vmm_pstate_ctrl.h>
39
40 #include "palacios.h"
41 #include "iface-pstate-ctrl.h"
42
43 #include "linux-exts.h"
44
45 /*
46    This P-STATE control implementation includes the following modes.
47    You can switch between modes at any time.
48
49    - Internal control of processor states in Palacios (handoff from Linux)
50      When Palacios acquires this control, this module disables Linux cpufreq control
51      and allows code within Palacios unfettered access to the DVFS hardware. 
52    - Direct control of Intel and AMD processor pstates using code in this module
53      When you acquire this control, this module disables Linux cpufreq control
54      and directly programs the processor itself in response to your requests
55    - External control of processor states via Linux 
56      When you acquire this control, this module uses the Linux cpufreq control
57      to program the processor on your behalf
58    - Host control of processor states
59      This is the normal mode of DVFS control (e.g., Linux cpufreq)
60
61    Additionally, it provides a user-space interface for manipulating
62    p-state regardless of the host's functionality.  This includes
63    an ioctl for commanding the implementation and a /proc file for 
64    showing current status and capabilities.  From user space, you can
65    use the Direct, External, and Host modes.  
66
67    What we mean by "p-state" here is the processor's internal
68    configuration.   For AMD, this is defined as being the same as
69    the ACPI-defined p-state.  For Intel, it is not.  There, it is the 
70    contents of the perf ctl MSR, which is opaque.   We try hard to 
71    provide "p-states" that go from 0...max, by analogy or equivalence
72    to the ACPI p-states. 
73
74 */
75
76
/* Name under which the stub governor in this file registers with cpufreq */
#define PALACIOS_GOVNAME "v3vee"
/* Size of the buffer that holds the governor-switch shell command */
#define MAX_PATH_LEN     128
/* Size of the buffer that holds a saved governor name */
#define MAX_GOV_NAME_LEN 16
80
81
/*
 * Mode value meaning the host (e.g., Linux cpufreq) is in control.
 * The remaining values of "mode" come from the Palacios interface:
 *   V3_PSTATE_EXTERNAL_CONTROL
 *   V3_PSTATE_DIRECT_CONTROL
 *   V3_PSTATE_INTERNAL_CONTROL
 */
#define V3_PSTATE_HOST_CONTROL 0

/* Per-core p-state bookkeeping; one instance exists for every CPU */
struct pstate_core_info {
    /* Current control mode of this core (see above) */
    uint32_t mode;

    /* Meaningful while the core is under DIRECT control */
    uint64_t cur_pstate;
    uint64_t max_pstate;
    uint64_t min_pstate;

    uint64_t cur_hw_pstate;

    /* Meaningful while the core is under EXTERNAL control */
    uint64_t set_freq_khz; /* the frequency we are hoping to get */
    uint64_t cur_freq_khz;
    uint64_t max_freq_khz;
    uint64_t min_freq_khz;

    /* Intel-specific state */
    uint8_t prior_speedstep;
    uint8_t turbo_disabled;
    uint8_t no_turbo;

    int have_cpufreq;

    /* Linux's governor is stashed here across a mode switch */
    char * linux_governor;
    /* Lets us restore the frequency that was in effect when we started */
    uint64_t original_hz; 

};
117
118
119 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
120
121
122
/*
 * Architecture-specific operations used to assert DIRECT control over
 * the core pstates.  One table exists per supported vendor.
 */
struct pstate_core_funcs {
    /* per-core setup and teardown */
    void     (*arch_init)(void);
    void     (*arch_deinit)(void);
    /* range of selectable pstates */
    uint64_t (*get_min_pstate)(void);
    uint64_t (*get_max_pstate)(void);
    /* read / program the current pstate */
    uint64_t (*get_pstate)(void);
    void     (*set_pstate)(uint64_t pstate);
};
132
/* Machine-wide description of the DVFS features that were detected */
struct pstate_machine_info {
    enum {INTEL, AMD, OTHER } arch;
    int supports_pstates;

    /* AMD feature flags */
    int have_pstate;
    int have_coreboost;
    int have_feedback;  

    /* Intel feature flags */
    int have_speedstep;
    int have_opportunistic; /* this means "Turbo Boost" or "IDA" */
    int have_policy_hint;
    int have_hwp;           /* hardware-controlled performance states */
    int have_hdc;           /* hardware duty cycling */
    int have_mwait_ext;     /* mwait power extensions */
    int have_mwait_int;     /* mwait wakes on interrupt */

    /* Common to both vendors */
    int have_pstate_hw_coord;  /* mperf/aperf */

    /* Operations used for DIRECT control */
    struct pstate_core_funcs *funcs;

};
159
160 static struct pstate_machine_info machine_state;
161
162
163 /****************************************************
164   AMD  DIRECT CONTROL
165  ***************************************************/
166
/*
 * P-state MSRs: limit (read in the min/max getters), control (written to
 * request a pstate) and status (read to find the current pstate).
 * See AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557
 */
#define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
#define MSR_PSTATE_CTL_REG_AMD   0xc0010062
#define MSR_PSTATE_STAT_REG_AMD  0xc0010063
171
/* Overlay used when reading MSR_PSTATE_LIMIT_REG_AMD */
struct p_state_limit_reg_amd {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
            uint8_t  pstate_limit : 4;
            /* highest P-state value supported  (lowest perf) */
            uint8_t  pstate_max   : 4;
            uint64_t rsvd         : 56;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
182
183
/* Overlay used when reading MSR_PSTATE_STAT_REG_AMD */
struct p_state_stat_reg_amd {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /* low four bits; returned as the current pstate */
            uint8_t  pstate  : 4;
            uint64_t rsvd    : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
193
194
/* Overlay used when writing MSR_PSTATE_CTL_REG_AMD */
struct p_state_ctl_reg_amd {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /* low four bits; the pstate being requested */
            uint8_t  cmd  : 4;
            uint64_t rsvd : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
204
205
206 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
207 static uint8_t supports_pstates_amd (void)
208 {
209     int i;
210     int mapwrong=0;
211     int amd_num_pstates;
212
213     uint32_t eax, ebx, ecx, edx;
214
215     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
216     machine_state.have_pstate = !!(edx & (1 << 7));
217     machine_state.have_coreboost = !!(edx & (1<<9));
218     machine_state.have_feedback = !!(edx & (1<<11));
219
220     cpuid(0x6, &eax, &ebx, &ecx, &edx);
221     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
222
223     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
224             machine_state.have_pstate, 
225             machine_state.have_coreboost, 
226             machine_state.have_feedback,
227             machine_state.have_pstate_hw_coord);
228
229     amd_num_pstates = get_cpu_var(processors)->performance->state_count;
230     if (amd_num_pstates) { 
231         for (i=0;i<amd_num_pstates;i++) { 
232             INFO("P-State: %u: freq=%llu ctrl=%llx%s\n",
233                  i, 
234                  get_cpu_var(processors)->performance->states[i].core_frequency*1000,
235                  get_cpu_var(processors)->performance->states[i].control,
236                  get_cpu_var(processors)->performance->states[i].control != i ? (mapwrong=1, " ALERT - CTRL MAPPING NOT 1:1") : "");
237         }
238     }
239     if (mapwrong) { 
240         ERROR("P-State: AMD: mapping of pstate and control is not 1:1 on this processor - we will probably not work corrrectly\n");
241     }
242
243     return machine_state.have_pstate;
244
245
246 }
247
248
/* Per-core setup hook for AMD; intentionally empty (KCH: nothing to do here) */
static void init_arch_amd(void)
{
    return;
}
253
254
/* Per-core teardown hook for AMD; intentionally empty (KCH: nothing to do here) */
static void deinit_arch_amd(void)
{
    return;
}
259
260
261 static uint64_t get_pstate_amd(void) 
262 {
263     struct p_state_stat_reg_amd pstat;
264
265     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
266
267     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
268     put_cpu_var(core_state);
269
270     return pstat.reg.pstate;
271 }
272
273
274 static void set_pstate_amd(uint64_t p)
275 {
276     struct p_state_ctl_reg_amd pctl;
277
278     if (p>get_cpu_var(core_state).max_pstate) { 
279         p=get_cpu_var(core_state).max_pstate;
280     }
281     put_cpu_var(core_state);
282
283     pctl.val = 0;
284     pctl.reg.cmd = p;
285
286     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
287
288     get_cpu_var(core_state).cur_pstate=p;
289     put_cpu_var(core_state);
290 }
291
292
293 /*
294  * NOTE: HW may change this value at runtime
295  */
296 static uint64_t get_max_pstate_amd(void)
297 {
298     struct p_state_limit_reg_amd plimits;
299
300     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
301
302     return plimits.reg.pstate_max;
303 }
304
305
306 static uint64_t get_min_pstate_amd(void)
307 {
308     struct p_state_limit_reg_amd plimits;
309
310     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
311
312     return plimits.reg.pstate_limit;
313 }
314
315
316 static struct pstate_core_funcs amd_funcs =
317 {
318     .arch_init        = init_arch_amd,
319     .arch_deinit      = deinit_arch_amd,
320     .get_pstate       = get_pstate_amd,
321     .set_pstate       = set_pstate_amd,
322     .get_max_pstate   = get_max_pstate_amd,
323     .get_min_pstate   = get_min_pstate_amd,
324 };
325
326
327
328 /***********************************************************
329   INTEL DIRECT CONTROL
330  **********************************************************/
331
332
333 /*
334    This implementation uses SpeedStep, but does check
335    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
336    are available.
337 */
338
/*
 * MSR numbers used by (or reserved for) the Intel implementation.
 * See Intel System Programmer's Manual Vol. 3B, 14-2
 */
#define MSR_MPERF_IA32         0x000000e7
#define MSR_APERF_IA32         0x000000e8
#define MSR_MISC_ENABLE_IA32   0x000001a0
#define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
#define MSR_PLATFORM_INFO_IA32 0x000000ce
#define MSR_PERF_CTL_IA32      0x00000199
#define MSR_PERF_STAT_IA32     0x00000198
#define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
348
349
350 /* Note that the actual  meaning of the pstate
351    in the control and status registers is actually
352    implementation dependent, unlike AMD.   The "official"
353    way to figure it out the mapping from pstate to 
354    these values is via ACPI.  What is written in the register
355    is an "id" of an operation point
356
357    "Often", the 16 bit field consists of a high order byte
358    which is the frequency (the multiplier) and the low order
359    byte is the voltage. 
360    */
/* Overlay for MSR_PERF_CTL_IA32 (read/write) */
struct perf_ctl_reg_intel {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /*
             * The target operating point.  This is not the ACPI pstate;
             * Intel treats the value as opaque.  For lots of
             * implementations it seems to be frequency_id : voltage_id,
             * where frequency_id is typically the multiplier.
             */
            uint16_t pstate                 : 16;
            uint16_t reserved               : 16;
            /*
             * Set to 1 to *disengage* dynamic acceleration.
             * Note that "IDA" and "Turbo" use the same interface.
             */
            uint16_t dynamic_accel_disable  : 1;
            uint32_t reserved2              : 31;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
381
/* Overlay for MSR_PERF_STAT_IA32 (read-only) */
struct perf_stat_reg_intel {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /* the operating point currently in effect */
            uint16_t pstate                 : 16;
            uint64_t reserved               : 48;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
393
/* Overlay for MSR_ENERGY_PERF_BIAS_IA32 (read/write) */
struct enery_perf_bias_reg_intel {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            /* the policy hint currently in effect (low four bits) */
            uint8_t  policy_hint            : 4;
            uint64_t reserved               : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
405
/*
 * Overlay for MSR_PLATFORM_INFO.
 *
 * Every field is declared with a 64-bit base type so that all of them are
 * allocated back to back inside one 64-bit storage unit.  With narrower
 * base types a field may not straddle the storage unit of its own type
 * (the inner struct itself is not packed), so the 10-bit field that starts
 * at bit 30 would be pushed to bit 32, shifting min_ratio off its intended
 * position at bits 47:40 and growing the struct beyond 8 bytes.
 */
struct turbo_mode_info_reg_intel {
    union {
        /* raw register contents */
        uint64_t val;
        struct {
            uint64_t rsvd0                  : 8;   /* bits  7:0  */
            uint64_t max_noturbo_ratio      : 8;   /* bits 15:8  */
            uint64_t rsvd1                  : 7;   /* bits 22:16 */
            uint64_t ppin_cap               : 1;   /* bit  23    */
            uint64_t rsvd2                  : 4;   /* bits 27:24 */
            uint64_t ratio_limit            : 1;   /* bit  28    */
            uint64_t tdc_tdp_limit          : 1;   /* bit  29    */
            uint64_t rsvd3                  : 10;  /* bits 39:30 */
            uint64_t min_ratio              : 8;   /* bits 47:40 */
            uint64_t rsvd4                  : 16;  /* bits 63:48 */
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
424
/*
 * One entry of the pstate -> control-value mapping.
 * This replicates the critical information in Linux's
 * struct acpi_processor_px to make it easier to port to other OSes.
 */
struct intel_pstate_info {
    /* frequency of this state, in KHz */
    uint64_t freq;
    /* what to write into the _CTL MSR to get this state */
    uint64_t ctrl;
};
431
/* Fallback table, used if the mapping cannot be built locally */
static struct intel_pstate_info *intel_pstate_to_ctrl_internal = NULL;
static int intel_num_pstates_internal = 0;

/* Active table: either the fallback above or a constructed array */
static struct intel_pstate_info *intel_pstate_to_ctrl = NULL;
static int intel_num_pstates = 0;
439
440
441 /* CPUID.01:ECX.AES(7) */
442 static uint8_t supports_pstates_intel(void)
443 {
444     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
445     */
446     uint32_t eax, ebx, ecx, edx;
447
448     cpuid(0x1, &eax, &ebx, &ecx, &edx);
449     machine_state.have_speedstep =  !!(ecx & (1 << 7));
450
451     cpuid(0x6, &eax, &ebx, &ecx, &edx);
452     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
453     machine_state.have_opportunistic =  !!(eax & 1<<1);
454     machine_state.have_policy_hint = !!(ecx & 1<<3);
455     machine_state.have_hwp = !!(eax & 1<<7);
456     machine_state.have_hdc = !!(eax & 1<<13);
457
458     cpuid(0x5, &eax, &ebx, &ecx, &edx);
459     machine_state.have_mwait_ext =  !!(ecx & 1);
460     machine_state.have_mwait_int =  !!(ecx & 1<<1);
461
462
463     // Note we test all the available hardware features documented as of August 2014
464     // We are only currently using speed_step, however.
465
466     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
467             machine_state.have_speedstep, 
468             machine_state.have_pstate_hw_coord, 
469             machine_state.have_opportunistic,
470             machine_state.have_policy_hint,
471             machine_state.have_hwp,
472             machine_state.have_hdc,
473             machine_state.have_mwait_ext,
474             machine_state.have_mwait_int );
475
476
477     if (machine_state.have_speedstep) {
478         uint32_t i;
479         // Build mapping table (from "pstate" (0..) to ctrl value for MSR
480         if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { 
481             put_cpu_var(processors);
482             // no acpi...  revert to internal table
483             intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
484             intel_num_pstates=intel_num_pstates_internal;
485         } else {
486             intel_num_pstates = get_cpu_var(processors)->performance->state_count;
487             if (intel_num_pstates) { 
488                 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
489                 if (!intel_pstate_to_ctrl) { 
490                     ERROR("P-State: Cannot allocate space for mapping...\n");
491                     intel_num_pstates=0;
492                 }
493                 for (i=0;i<intel_num_pstates;i++) { 
494                     intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
495                     intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
496                 }
497                     
498             } else {
499                 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
500             }
501         }
502         put_cpu_var(processors);
503         INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
504         for (i=0;i<intel_num_pstates;i++) {
505             INFO("P-State: Intel Mapping %u:  freq=%llu  ctrl=%llx\n",
506                  i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
507         }
508     } else {
509         INFO("P-State: Intel:  No speedstep here\n");
510     }
511         
512
513     return machine_state.have_speedstep;
514 }
515
516
517 static void init_arch_intel(void)
518 {
519     uint64_t val;
520
521     rdmsrl(MSR_MISC_ENABLE_IA32, val);
522
523     //INFO("P-State: prior ENABLE=%llx\n",val);
524
525     // store prior speedstep setting
526     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
527     put_cpu_var(core_state);
528
529     // enable speedstep (probably already on)
530     val |= 1 << 16;
531     wrmsrl(MSR_MISC_ENABLE_IA32, val);
532
533     //INFO("P-State: write ENABLE=%llx\n",val);
534
535 }
536
537 static void deinit_arch_intel(void)
538 {
539     uint64_t val;
540
541     rdmsrl(MSR_MISC_ENABLE_IA32, val);
542
543     //INFO("P-State: deinit: ENABLE=%llx\n",val);
544
545     val &= ~(1ULL << 16);
546     val |= get_cpu_var(core_state).prior_speedstep << 16;
547     put_cpu_var(core_state);
548
549     wrmsrl(MSR_MISC_ENABLE_IA32, val);
550
551     //INFO("P-state: deinit ENABLE=%llx\n",val);
552
553 }
554
555 /* TODO: Intel P-states require sampling at intervals... */
556 static uint64_t get_pstate_intel(void)
557 {
558     uint64_t val;
559
560     rdmsrl(MSR_PERF_STAT_IA32,val);
561
562     //INFO("P-State: Get: 0x%llx\n", val);
563
564     // should check if turbo is active, in which case 
565     // this value is not the whole story
566
567     return val;
568 }
569
570 static void set_pstate_intel(uint64_t p)
571 {
572     uint64_t val;
573     uint64_t ctrl;
574
575     if (intel_num_pstates==0) { 
576         return ;
577     } else {
578         if (p>=intel_num_pstates) { 
579             p=intel_num_pstates-1;
580         }
581     }
582
583     ctrl=intel_pstate_to_ctrl[p].ctrl;
584
585     /* ...Intel IDA (dynamic acceleration)
586        if (c->no_turbo && !c->turbo_disabled) {
587        val |= 1 << 32;
588        }
589        */
590     // leave all bits along expect for the likely
591     // fid bits
592
593     rdmsrl(MSR_PERF_CTL_IA32, val);
594     //INFO("P-State: Pre-Set: 0x%llx\n", val);
595
596     val &= ~0xffffULL;
597     val |= ctrl & 0xffffULL;
598
599     //INFO("P-State: Set: 0x%llx\n", val);
600
601     wrmsrl(MSR_PERF_CTL_IA32, val);
602
603     get_cpu_var(core_state).cur_pstate = p;
604     put_cpu_var(core_state);
605 }
606
607
/* The Intel pstate indices used here always start at zero */
static uint64_t get_min_pstate_intel(void)
{
    const uint64_t lowest_index = 0;

    return lowest_index;
}
612
613
614
615 static uint64_t get_max_pstate_intel (void)
616 {
617     if (intel_num_pstates==0) { 
618         return 0;
619     } else {
620         return intel_num_pstates-1;
621     }
622 }
623
624 static struct pstate_core_funcs intel_funcs =
625 {
626     .arch_init        = init_arch_intel,
627     .arch_deinit      = deinit_arch_intel,
628     .get_pstate       = get_pstate_intel,
629     .set_pstate       = set_pstate_intel,
630     .get_max_pstate   = get_max_pstate_intel,
631     .get_min_pstate   = get_min_pstate_intel,
632 };
633
634
635
636 /***********************************************
637   Arch determination and setup
638  ***********************************************/
639
640 static inline void cpuid_string (uint32_t id, uint32_t dest[4]) 
641 {
642     asm volatile("cpuid"
643             :"=a"(*dest),"=b"(*(dest+1)),"=c"(*(dest+2)),"=d"(*(dest+3))
644             :"a"(id));
645 }
646
647
/*
 * Fetch the 12-character CPU vendor string via CPUID leaf 0.
 *
 * name must have room for 13 bytes; it is always NUL-terminated.
 * The string is assembled from ebx, edx, ecx, in that order.
 *
 * Returns the value CPUID leaf 0 reported in eax.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t dest[4];
    uint32_t maxid;

    cpuid_string(0,dest);
    maxid=dest[0];

    // Copy bytewise instead of storing through a uint32_t* cast of name:
    // name is a char buffer with no alignment guarantee
    memcpy(name,     &dest[1], sizeof(uint32_t));
    memcpy(name + 4, &dest[3], sizeof(uint32_t));
    memcpy(name + 8, &dest[2], sizeof(uint32_t));
    name[12]=0;

    return maxid;
}
662
663
/* Nonzero when the CPU vendor string is exactly "GenuineIntel" */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor,"GenuineIntel") == 0;
}
670
671
/* Nonzero when the CPU vendor string is exactly "AuthenticAMD" */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor,"AuthenticAMD") == 0;
}
678
679 static int pstate_arch_setup(void)
680 {
681
682     if (is_amd()) {
683         machine_state.arch = AMD;
684         machine_state.funcs = &amd_funcs;
685         machine_state.supports_pstates = supports_pstates_amd();
686         INFO("PSTATE: P-State initialized for AMD\n");
687     } else if (is_intel()) {
688         machine_state.arch  = INTEL;
689         machine_state.funcs = &intel_funcs;
690         machine_state.supports_pstates = supports_pstates_intel();
691         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
692         return 0;
693
694     } else {
695         machine_state.arch = OTHER;
696         machine_state.funcs = NULL;
697         machine_state.supports_pstates = 0;
698         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
699         return 0;
700     }
701
702     return 0;
703 }
704
705
706
707 /******************************************************************
708   Linux Interface
709  *****************************************************************/
710
/* Serializes governor start/stop and the counter below */
static DEFINE_MUTEX(v3_governor_mutex);
/* Number of cores on which the stub governor is currently started */
static unsigned cpus_using_v3_governor;
713
714 /* KCH: this will tell us when there is an actual frequency transition */
715 static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
716         void *data)
717 {
718     struct cpufreq_freqs *freq = data;
719
720     if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) {
721         return 0;
722     }
723
724     if (val == CPUFREQ_POSTCHANGE) {
725         DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
726                 freq->cpu, freq->new);
727         per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new;
728     }
729
730     return 0;
731
732 }
733
734
735 static struct notifier_block v3_cpufreq_notifier_block = {
736     .notifier_call = v3_cpufreq_notifier
737 };
738
739
740 /* 
741  * This stub governor is simply a placeholder for preventing 
742  * frequency changes from the Linux side. For now, we simply leave
743  * the frequency as is when we acquire control. 
744  */
745 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
746 {
747     unsigned cpu = policy->cpu;
748
749     switch (event) {
750         /* we can't use cpufreq_driver_target here as it can result
751          * in a circular dependency, so we'll keep the current frequency as is
752          */
753         case CPUFREQ_GOV_START:
754             BUG_ON(!policy->cur);
755
756             mutex_lock(&v3_governor_mutex);
757
758             if (cpus_using_v3_governor == 0) {
759                 cpufreq_register_notifier(&v3_cpufreq_notifier_block,
760                         CPUFREQ_TRANSITION_NOTIFIER);
761             }
762
763             cpus_using_v3_governor++;
764
765             per_cpu(core_state, cpu).set_freq_khz = policy->cur;
766             per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
767             per_cpu(core_state, cpu).max_freq_khz = policy->max;
768             per_cpu(core_state, cpu).min_freq_khz = policy->min;
769
770             mutex_unlock(&v3_governor_mutex);
771             break;
772         case CPUFREQ_GOV_STOP:
773             mutex_lock(&v3_governor_mutex);
774
775             cpus_using_v3_governor--;
776
777             if (cpus_using_v3_governor == 0) {
778                 cpufreq_unregister_notifier(
779                         &v3_cpufreq_notifier_block,
780                         CPUFREQ_TRANSITION_NOTIFIER);
781             }
782
783             per_cpu(core_state, cpu).set_freq_khz = 0;
784             per_cpu(core_state, cpu).cur_freq_khz = 0;
785             per_cpu(core_state, cpu).max_freq_khz = 0;
786             per_cpu(core_state, cpu).min_freq_khz = 0;
787
788             mutex_unlock(&v3_governor_mutex);
789             break;
790         case CPUFREQ_GOV_LIMITS:
791             /* do nothing */
792             break;
793         default:
794             ERROR("Undefined governor command (%u)\n", event);
795             return -1;
796     }                           
797
798     return 0;
799 }
800
801
802 static struct cpufreq_governor stub_governor = 
803 {
804     .name = PALACIOS_GOVNAME,
805     .governor = governor_run,
806     .owner = THIS_MODULE,
807 };
808
809
810 static struct workqueue_struct *pstate_wq;
811
812 typedef struct {
813     struct work_struct work;
814     uint64_t freq;
815 } pstate_work_t;
816
817
818
819 static inline void pstate_register_linux_governor(void)
820 {
821     cpufreq_register_governor(&stub_governor);
822 }
823
824
825 static inline void pstate_unregister_linux_governor(void)
826 {
827     cpufreq_unregister_governor(&stub_governor);
828 }
829
830
831 static int pstate_linux_init(void)
832 {
833     pstate_register_linux_governor();
834     pstate_wq = create_workqueue("v3vee_pstate_wq");
835     if (!pstate_wq) {
836         ERROR("Could not create work queue\n");
837         goto out_err;
838     }
839
840     return 0;
841
842 out_err:
843     pstate_unregister_linux_governor();
844     return -1;
845 }
846
847
848 static void pstate_linux_deinit(void)
849 {
850     pstate_unregister_linux_governor();
851     flush_workqueue(pstate_wq);
852     destroy_workqueue(pstate_wq);
853 }
854
855
856 static int get_current_governor(char **buf, unsigned int cpu)
857 {
858     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
859     char * govname = NULL;
860
861     if (!policy) {
862         ERROR("could not allocate cpufreq_policy\n");
863         return -1;
864     }
865         
866     if (cpufreq_get_policy(policy, cpu) != 0) {
867         ERROR("Could not get current cpufreq policy\n");
868         goto out_err;
869     }
870
871     /* We're in interrupt context, should probably not wait here */
872     govname = palacios_alloc(MAX_GOV_NAME_LEN);
873     if (!govname) {
874         ERROR("Could not allocate space for governor name\n");
875         goto out_err;
876     }
877
878     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
879
880     get_cpu_var(core_state).linux_governor = govname;
881     put_cpu_var(core_state);
882
883     *buf = govname;
884
885     palacios_free(policy);
886
887     return 0;
888
889 out_err:
890     palacios_free(policy);
891     return -1;
892 }
893
894
895 /* passed to the userspacehelper interface for cleanup */
896 static void gov_switch_cleanup(struct subprocess_info * s)
897 {
898     palacios_free(s->argv[2]);
899     palacios_free(s->argv);
900 }
901
902
903 /* 
904  * Switch governors
905  * @s - the governor to switch to 
906  * TODO: this should probably be submitted to a work queue
907  * so we don't have to run it in interrupt context
908  */
909 static int governor_switch(char * s, unsigned int cpu)
910 {
911     char * path_str = NULL;
912     char ** argv = NULL; 
913
914     static char * envp[] = {
915         "HOME=/",
916         "TERM=linux",
917         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
918
919
920     argv = palacios_alloc(4*sizeof(char*));
921     if (!argv) {
922         ERROR("Couldn't allocate argv struct\n");
923         return -1;
924     }
925
926     path_str = palacios_alloc(MAX_PATH_LEN);
927     if (!path_str) {
928         ERROR("Couldn't allocate path string\n");
929         goto out_freeargv;
930     }
931     memset(path_str, 0, MAX_PATH_LEN);
932
933     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
934
935     argv[0] = "/bin/sh";
936     argv[1] = "-c";
937     argv[2] = path_str;
938     argv[3] = NULL;
939
940     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
941     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
942
943 out_freeargv:
944     palacios_free(argv);
945     return -1;
946 }
947
948
949 static inline void free_linux_governor(void)
950 {
951     palacios_free(get_cpu_var(core_state).linux_governor);
952     put_cpu_var(core_state);
953 }
954
955
956 static int linux_setup_palacios_governor(void)
957 {
958     char * gov;
959     unsigned int cpu = get_cpu();
960     put_cpu();
961
962     /* KCH:  we assume the v3vee governor is already 
963      * registered with kernel by this point 
964      */
965
966     if (get_current_governor(&gov, cpu) < 0) {
967         ERROR("Could not get current governor\n");
968         return -1;
969     }
970
971     DEBUG("saving current governor (%s)\n", gov);
972
973     get_cpu_var(core_state).linux_governor = gov;
974     put_cpu_var(core_state);
975     
976     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
977
978     /* set the new one to ours */
979
980     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
981         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
982         return -1;
983     }
984
985     return 0;
986 }
987
988
989
990 static uint64_t linux_get_pstate(void)
991 {
992     struct cpufreq_policy * policy = NULL;
993     struct cpufreq_frequency_table *table;
994     unsigned int i = 0;
995     unsigned int count = 0;
996     unsigned int cpu = get_cpu(); 
997     put_cpu();
998
999
1000     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1001     if (!policy) {
1002         ERROR("Could not allocate policy struct\n");
1003         return -1;
1004     }
1005
1006     cpufreq_get_policy(policy, cpu);
1007     table = cpufreq_frequency_get_table(cpu);
1008
1009     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1010
1011         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1012             continue;
1013         }
1014
1015         if (table[i].frequency == policy->cur) {
1016             break;
1017         }
1018
1019         count++;
1020     }
1021
1022     palacios_free(policy);
1023
1024     put_cpu();
1025     return count;
1026 }
1027
1028
1029 static uint64_t linux_get_freq(void)
1030 {
1031     uint64_t freq;
1032     struct cpufreq_policy * policy = NULL;
1033     unsigned int cpu = get_cpu();
1034     put_cpu();
1035
1036     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1037     if (!policy) {
1038         ERROR("Could not allocate policy struct\n");
1039         return -1;
1040     }
1041
1042     if (cpufreq_get_policy(policy, cpu)) {
1043         ERROR("Could not get current policy\n");
1044         return -1;
1045     }
1046
1047     freq=policy->cur;
1048
1049     palacios_free(policy);
1050
1051     return freq;
1052 }
1053
1054 static void  
1055 pstate_switch_workfn (struct work_struct *work)
1056 {
1057     pstate_work_t * pwork = (pstate_work_t*)work;
1058     struct cpufreq_policy * policy = NULL;
1059     uint64_t freq; 
1060     unsigned int cpu = get_cpu();
1061     put_cpu();
1062
1063     mutex_lock(&v3_governor_mutex);
1064
1065     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1066     if (!policy) {
1067         ERROR("Could not allocate space for cpufreq policy\n");
1068         goto out;
1069     }
1070
1071     if (cpufreq_get_policy(policy, cpu) != 0) {
1072         ERROR("Could not get cpufreq policy\n");
1073         goto out1;
1074     }
1075
1076     freq = pwork->freq;
1077     get_cpu_var(core_state).set_freq_khz = freq;
1078
1079     if (freq < get_cpu_var(core_state).min_freq_khz) {
1080         freq = get_cpu_var(core_state).min_freq_khz;
1081     }
1082     if (freq > get_cpu_var(core_state).max_freq_khz) {
1083         freq = get_cpu_var(core_state).max_freq_khz;
1084     }
1085     put_cpu_var(core_state);
1086
1087     INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
1088     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
1089
1090 out1:
1091     palacios_free(policy);
1092 out:
1093     palacios_free(work);
1094     mutex_unlock(&v3_governor_mutex);
1095
1096
1097
1098 static int linux_set_pstate(uint64_t p)
1099 {
1100     struct cpufreq_policy * policy = NULL;
1101     struct cpufreq_frequency_table *table;
1102     pstate_work_t * work = NULL;
1103     unsigned int i = 0;
1104     unsigned int count = 0;
1105     int state_set = 0;
1106     int last_valid = 0;
1107     unsigned int cpu = get_cpu();
1108     put_cpu();
1109
1110     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1111     if (!policy) {
1112         ERROR("Could not allocate policy struct\n");
1113         return -1;
1114     }
1115
1116     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1117     if (!work) {
1118         ERROR("Could not allocate work struct\n");
1119         goto out_err;
1120     }
1121
1122     if (cpufreq_get_policy(policy, cpu)) {
1123         ERROR("Could not get current policy\n");
1124         goto out_err1;
1125     }
1126     table = cpufreq_frequency_get_table(cpu);
1127
1128     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1129
1130         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1131             continue;
1132         }
1133
1134         if (count == p) {
1135
1136             INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1137             work->freq = table[i].frequency;
1138             queue_work(pstate_wq, (struct work_struct*)work);
1139
1140             state_set = 1;
1141             break;
1142         }
1143
1144         count++;
1145         last_valid = i;
1146     }
1147
1148     /* we need to deal with the case in which we get a number > max pstate */
1149     if (!state_set) {
1150         INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1151         work->freq = table[last_valid].frequency;
1152         queue_work(pstate_wq, (struct work_struct*)work);
1153     }
1154
1155     palacios_free(policy);
1156     return 0;
1157
1158 out_err1: 
1159     palacios_free(work);
1160 out_err:
1161     palacios_free(policy);
1162     return -1;
1163 }
1164
1165
1166 static int linux_set_freq(uint64_t f)
1167 {
1168     struct cpufreq_policy * policy = NULL;
1169     pstate_work_t * work = NULL;
1170     uint64_t freq;
1171     unsigned int cpu = get_cpu();
1172     put_cpu();
1173
1174     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1175     if (!policy) {
1176         ERROR("Could not allocate policy struct\n");
1177         return -1;
1178     }
1179
1180     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1181     if (!work) {
1182         ERROR("Could not allocate work struct\n");
1183         goto out_err;
1184     }
1185
1186     if (cpufreq_get_policy(policy, cpu) != 0) {
1187         ERROR("Could not get cpufreq policy\n");
1188         goto out_err1;
1189     }
1190
1191     if (f < policy->min) {
1192         freq = policy->min;
1193     } else if (f > policy->max) {
1194         freq = policy->max;
1195     } else {
1196         freq = f;
1197     }
1198
1199     INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1200     work->freq = freq;
1201     queue_work(pstate_wq, (struct work_struct*)work);
1202
1203     palacios_free(policy);
1204     return 0;
1205
1206 out_err1:
1207     palacios_free(work);
1208 out_err:
1209     palacios_free(policy);
1210     return -1;
1211 }
1212
1213
1214 static int linux_restore_defaults(void)
1215 {
1216     char * gov = NULL;
1217     unsigned int cpu = get_cpu();
1218     put_cpu();
1219
1220     gov = get_cpu_var(core_state).linux_governor;
1221     put_cpu_var(core_state);
1222
1223     DEBUG("restoring previous governor (%s)\n", gov);
1224
1225     if (governor_switch(gov, cpu) < 0) {
1226         ERROR("Could not restore governor to (%s)\n", gov);
1227         goto out_err;
1228     }
1229
1230     free_linux_governor();
1231     return 0;
1232
1233 out_err:
1234     free_linux_governor();
1235     return -1;
1236 }
1237
1238
1239
1240 /******************************************************************
1241   Generic Interface as provided to Palacios and to the rest of the
1242   module
1243  ******************************************************************/
1244
1245 static void init_core(void)
1246 {
1247     unsigned cpu;
1248     struct cpufreq_policy *p;
1249
1250
1251     //DEBUG("P-State Core Init\n");
1252
1253     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1254     get_cpu_var(core_state).cur_pstate = 0;
1255
1256     if (machine_state.funcs) {
1257         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1258         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1259     } else {
1260         get_cpu_var(core_state).min_pstate = 0;
1261         get_cpu_var(core_state).max_pstate = 0;
1262     }
1263
1264
1265     cpu = get_cpu(); put_cpu();
1266
1267     p = cpufreq_cpu_get(cpu);
1268
1269     if (!p) { 
1270         get_cpu_var(core_state).have_cpufreq = 0;
1271         get_cpu_var(core_state).min_freq_khz=0;
1272         get_cpu_var(core_state).max_freq_khz=0;
1273         get_cpu_var(core_state).cur_freq_khz=0;
1274     } else {
1275         get_cpu_var(core_state).have_cpufreq = 1;
1276         get_cpu_var(core_state).min_freq_khz=p->min;
1277         get_cpu_var(core_state).max_freq_khz=p->max;
1278         get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); 
1279     put_cpu_var(core_state);
1280
1281     /*
1282     for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) { 
1283         INFO("P-State: %u: freq=%llu ctrl=%llx",
1284                 i, 
1285                 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1286                 get_cpu_var(processors)->performance->states[i].control);
1287    }
1288    put_cpu_var(processors);
1289     */
1290 }
1291
1292
1293 void palacios_pstate_ctrl_release(void);
1294
1295
/*
 * Per-core teardown: give p-state control back to the host on the core
 * this runs on.
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    /* no-op when the core is already under host control */
    palacios_pstate_ctrl_release();
}
1302
1303
1304
1305 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1306 {
1307     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1308
1309
1310     c->features = V3_PSTATE_INTERNAL_CONTROL;
1311
1312     if (get_cpu_var(core_state).have_cpufreq) {
1313         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1314     }
1315
1316     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1317         c->features |= V3_PSTATE_DIRECT_CONTROL;
1318     }
1319     c->cur_mode = get_cpu_var(core_state).mode;
1320     c->min_pstate = get_cpu_var(core_state).min_pstate;
1321     c->max_pstate = get_cpu_var(core_state).max_pstate;
1322     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1323     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1324     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1325     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1326
1327     put_cpu_var(core_state);
1328
1329
1330
1331 }
1332
1333
1334 uint64_t palacios_pstate_ctrl_get_pstate(void)
1335 {
1336     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1337         put_cpu_var(core_state);
1338         return machine_state.funcs->get_pstate();
1339     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1340         put_cpu_var(core_state);
1341         return linux_get_pstate();
1342     } else {
1343         put_cpu_var(core_state);
1344         return 0;
1345     }
1346 }
1347
1348
1349 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1350 {
1351     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1352         put_cpu_var(core_state);
1353         machine_state.funcs->set_pstate(p);
1354     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1355         put_cpu_var(core_state);
1356         linux_set_pstate(p);
1357     } else {
1358         put_cpu_var(core_state);
1359     }
1360 }
1361
1362
1363 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1364 {
1365     palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
1366 }
1367
1368
1369 uint64_t palacios_pstate_ctrl_get_freq(void)
1370 {
1371     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1372         put_cpu_var(core_state);
1373         return linux_get_freq();
1374     } else {
1375         put_cpu_var(core_state);
1376         return 0;
1377     }
1378 }
1379
1380
1381 void palacios_pstate_ctrl_set_freq(uint64_t p)
1382 {
1383     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1384         put_cpu_var(core_state);
1385         linux_set_freq(p);
1386     } else {
1387         put_cpu_var(core_state);
1388     }
1389 }
1390
1391
1392 static int switch_to_external(void)
1393 {
1394     DEBUG("switch from host control to external\n");
1395
1396     if (!(get_cpu_var(core_state).have_cpufreq)) {
1397         put_cpu_var(core_state);
1398         ERROR("No cpufreq  - cannot switch to external...\n");
1399         return -1;
1400     } 
1401     put_cpu_var(core_state);
1402
1403     linux_setup_palacios_governor();
1404
1405     get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1406     put_cpu_var(core_state);
1407
1408     return 0;
1409 }
1410
1411
1412 static int switch_to_direct(void)
1413 {
1414     DEBUG("switch from host control to direct\n");
1415
1416     if (get_cpu_var(core_state).have_cpufreq) { 
1417         put_cpu_var(core_state);
1418         DEBUG("switch to direct from cpufreq\n");
1419
1420         // The implementation would set the policy and governor to peg cpu
1421         // regardless of load
1422         linux_setup_palacios_governor();
1423     } else {
1424         put_cpu_var(core_state);
1425     }
1426
1427     if (machine_state.funcs && machine_state.funcs->arch_init) {
1428         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1429
1430         machine_state.funcs->arch_init();
1431
1432         put_cpu_var(core_state);
1433     }
1434
1435     return 0;
1436 }
1437
1438
1439 static int switch_to_internal(void)
1440 {
1441     DEBUG("switch from host control to internal\n");
1442
1443     if (get_cpu_var(core_state).have_cpufreq) { 
1444         put_cpu_var(core_state);
1445         DEBUG("switch to internal on machine with cpu freq\n");
1446         linux_setup_palacios_governor();
1447     } else {
1448         put_cpu_var(core_state);
1449     }
1450
1451     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1452
1453     put_cpu_var(core_state);
1454
1455     return 0;
1456 }
1457
1458
1459 static int switch_from_external(void)
1460 {
1461     if (!(get_cpu_var(core_state).have_cpufreq)) {
1462         put_cpu_var(core_state);
1463         ERROR("No cpufreq  - how did we get here... external...\n");
1464         return -1;
1465     }
1466     put_cpu_var(core_state);
1467
1468     DEBUG("Switching back to host control from external\n");
1469
1470     if (get_cpu_var(core_state).have_cpufreq) { 
1471         put_cpu_var(core_state);
1472         linux_restore_defaults();
1473     } else {
1474         put_cpu_var(core_state);
1475     }
1476
1477     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1478     put_cpu_var(core_state);
1479
1480     return 0;
1481 }
1482
1483
1484 static int switch_from_direct(void)
1485 {
1486
1487     DEBUG("Switching back to host control from direct\n");
1488
1489     // Set maximum performance, just in case there is no host control
1490     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1491     machine_state.funcs->arch_deinit();
1492
1493     if (get_cpu_var(core_state).have_cpufreq) { 
1494         put_cpu_var(core_state);
1495         linux_restore_defaults();
1496     } else {
1497         put_cpu_var(core_state);
1498     }
1499
1500     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1501
1502     put_cpu_var(core_state);
1503
1504     return 0;
1505 }
1506
1507
1508 static int switch_from_internal(void)
1509 {
1510     DEBUG("Switching back to host control from internal\n");
1511
1512     if (get_cpu_var(core_state).have_cpufreq) { 
1513         put_cpu_var(core_state);
1514         linux_restore_defaults();
1515     } else {
1516         put_cpu_var(core_state);
1517     }
1518
1519     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1520
1521     put_cpu_var(core_state);
1522
1523     return 0;
1524 }
1525
1526
1527
1528 void palacios_pstate_ctrl_acquire(uint32_t type)
1529 {
1530     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1531         put_cpu_var(core_state);
1532         palacios_pstate_ctrl_release();
1533     } else {
1534         put_cpu_var(core_state);
1535     }
1536
1537     switch (type) { 
1538         case V3_PSTATE_EXTERNAL_CONTROL:
1539             switch_to_external();
1540             break;
1541         case V3_PSTATE_DIRECT_CONTROL:
1542             switch_to_direct();
1543             break;
1544         case V3_PSTATE_INTERNAL_CONTROL:
1545             switch_to_internal();
1546             break;
1547         default:
1548             ERROR("Unknown pstate control type %u\n",type);
1549             break;
1550     }
1551
1552 }
1553
1554 // Wrappers for xcalls
1555 static void palacios_pstate_ctrl_acquire_external(void)
1556 {
1557     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1558 }
1559
1560 static void palacios_pstate_ctrl_acquire_direct(void)
1561 {
1562     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1563 }
1564
1565
1566 void palacios_pstate_ctrl_release(void)
1567 {
1568     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1569         put_cpu_var(core_state);
1570         return;
1571     } 
1572     put_cpu_var(core_state);
1573
1574     switch (get_cpu_var(core_state).mode) { 
1575         case V3_PSTATE_EXTERNAL_CONTROL:
1576             put_cpu_var(core_state);
1577             switch_from_external();
1578             break;
1579         case V3_PSTATE_DIRECT_CONTROL:
1580             put_cpu_var(core_state);
1581             switch_from_direct();
1582             break;
1583         case V3_PSTATE_INTERNAL_CONTROL:
1584             put_cpu_var(core_state);
1585             switch_from_internal();
1586             break;
1587         default:
1588             put_cpu_var(core_state);
1589             ERROR("Unknown pstate control type %u\n",core_state.mode);
1590             break;
1591     }
1592 }
1593
1594
1595 static void update_hw_pstate(void *arg)
1596 {
1597     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1598         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1599         put_cpu_var(core_state);
1600     } else {
1601         get_cpu_var(core_state).cur_hw_pstate = 0;
1602         put_cpu_var(core_state);
1603     }
1604 }
1605
1606
1607 /***************************************************************************
1608   PROC Interface to expose state
1609  ***************************************************************************/
1610
1611 static int pstate_show(struct seq_file * file, void * v)
1612 {
1613     unsigned int cpu;
1614     unsigned int numcpus = num_online_cpus();
1615
1616     seq_printf(file, "V3VEE DVFS Status\n\n");
1617
1618     for (cpu=0;cpu<numcpus;cpu++) { 
1619         palacios_xcall(cpu,update_hw_pstate,0);
1620     }
1621
1622     for (cpu=0;cpu<numcpus;cpu++) { 
1623         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1624         seq_printf(file,"pcore %u: hw pstate 0x%llx mode %s ",cpu,
1625                 s->cur_hw_pstate,
1626                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1627                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1628                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1629                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1630         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1631             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1632         } 
1633         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1634             seq_printf(file,"(min=%llu max=%llu cur=%llu) ",s->min_pstate, s->max_pstate, s->cur_pstate);
1635         }
1636         seq_printf(file,"\n");
1637     }
1638     return 0;
1639 }
1640
1641 static int pstate_open(struct inode * inode, struct file * file) 
1642 {
1643     return single_open(file, pstate_show, NULL);
1644 }
1645
1646
1647 static struct file_operations pstate_fops = {
1648     .owner = THIS_MODULE,
1649     .open = pstate_open, 
1650     .read = seq_read,
1651     .llseek = seq_lseek,
1652     .release = seq_release
1653 };
1654
1655 static int pstate_hw_show(struct seq_file * file, void * v)
1656 {
1657     int numstates;
1658
1659     seq_printf(file, "V3VEE DVFS Hardware Info\n(all logical cores assumed identical)\n\n");
1660
1661     seq_printf(file, "Arch:   \t%s\n"
1662                      "PStates:\t%s\n\n",
1663             machine_state.arch==INTEL ? "Intel" : 
1664             machine_state.arch==AMD ? "AMD" : "Other",
1665             machine_state.supports_pstates ? "Yes" : "No");
1666
1667
1668 #define YN(x) ((x) ? "Y" : "N")
1669
1670     if (machine_state.arch==INTEL) {
1671         seq_printf(file,"SpeedStep:           \t%s\n",YN(machine_state.have_speedstep));
1672         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1673         seq_printf(file,"IDA or TurboCore:    \t%s\n",YN(machine_state.have_opportunistic));
1674         seq_printf(file,"Policy Hint:         \t%s\n",YN(machine_state.have_policy_hint));
1675         seq_printf(file,"Hardware Policy:     \t%s\n",YN(machine_state.have_hwp));
1676         seq_printf(file,"Hardware Duty Cycle: \t%s\n",YN(machine_state.have_hdc));
1677         seq_printf(file,"MWAIT extensions:    \t%s\n",YN(machine_state.have_mwait_ext));
1678         seq_printf(file,"MWAIT wake on intr:  \t%s\n",YN(machine_state.have_mwait_int));
1679     } 
1680
1681     if (machine_state.arch==AMD) { 
1682         seq_printf(file,"PState:              \t%s\n",YN(machine_state.have_pstate));
1683         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1684         seq_printf(file,"CoreBoost:           \t%s\n",YN(machine_state.have_coreboost));
1685         seq_printf(file,"Feedback:            \t%s\n",YN(machine_state.have_feedback));
1686     }
1687
1688
1689     seq_printf(file,"\nPstate\tCtrl\tKHz\tmW\tuS(X)\tuS(B)\n");
1690     numstates = get_cpu_var(processors)->performance->state_count;
1691     if (!numstates) { 
1692         seq_printf(file,"UNKNOWN\n");
1693     } else {
1694         int i;
1695         for (i=0;i<numstates;i++) { 
1696             seq_printf(file,
1697                        "%u\t%llx\t%llu\t%llu\t%llu\t%llu\n",
1698                        i, 
1699                        get_cpu_var(processors)->performance->states[i].control,
1700                        get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1701                        get_cpu_var(processors)->performance->states[i].power,
1702                        get_cpu_var(processors)->performance->states[i].transition_latency,
1703                        get_cpu_var(processors)->performance->states[i].bus_master_latency);
1704         }
1705     }
1706     put_cpu_var(processors);
1707
1708     seq_printf(file,"\nAvailable Modes:");
1709     seq_printf(file," host");
1710     if (get_cpu_var(core_state).have_cpufreq) { 
1711         seq_printf(file," external");
1712     }
1713     put_cpu_var(core_state);
1714     if (machine_state.supports_pstates) {
1715         seq_printf(file," direct");
1716     }
1717     seq_printf(file," internal\n");
1718
1719     return 0;
1720 }
1721
1722 static int pstate_hw_open(struct inode * inode, struct file * file) 
1723 {
1724     return single_open(file, pstate_hw_show, NULL);
1725 }
1726
1727
1728 static struct file_operations pstate_hw_fops = {
1729     .owner = THIS_MODULE,
1730     .open = pstate_hw_open, 
1731     .read = seq_read,
1732     .llseek = seq_lseek,
1733     .release = seq_release
1734 };
1735
1736
1737 int pstate_proc_setup(void)
1738 {
1739     struct proc_dir_entry *proc;
1740     struct proc_dir_entry *prochw;
1741
1742     proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1743
1744     if (!proc) { 
1745         ERROR("Failed to create proc entry for p-state control\n");
1746         return -1;
1747     }
1748
1749     proc->proc_fops = &pstate_fops;
1750
1751     INFO("/proc/v3vee/v3-dvfs successfully created\n");
1752
1753     prochw = create_proc_entry("v3-dvfs-hw",0444,palacios_get_procdir());
1754
1755
1756     if (!prochw) { 
1757         ERROR("Failed to create proc entry for p-state hw info\n");
1758         return -1;
1759     }
1760
1761     prochw->proc_fops = &pstate_hw_fops;
1762
1763     INFO("/proc/v3vee/v3-dvfs-hw successfully created\n");
1764
1765     return 0;
1766 }
1767
/* Remove both DVFS proc entries, in the reverse of their creation order. */
void pstate_proc_teardown(void)
{
    /* hardware-info entry first */
    remove_proc_entry("v3-dvfs-hw",palacios_get_procdir());

    /* then the status entry */
    remove_proc_entry("v3-dvfs",palacios_get_procdir());
}
1773
1774 /********************************************************************
1775   User interface (ioctls)
1776  ********************************************************************/
1777
1778 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1779 {
1780     struct v3_dvfs_ctrl_request r;
1781
1782     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1783         ERROR("Failed to copy DVFS request from user\n");
1784         return -EFAULT;
1785     }
1786
1787     if (r.pcore >= num_online_cpus()) {
1788         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1789         return -EFAULT;
1790     }
1791
1792     switch (r.cmd) {
1793         case V3_DVFS_ACQUIRE: {
1794                                   switch (r.acq_type) { 
1795                                       case V3_DVFS_EXTERNAL:
1796                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1797                                           return 0;
1798                                           break;
1799                                       case V3_DVFS_DIRECT:
1800                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1801                                           return 0;
1802                                           break;
1803                                       default:
1804                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1805                                           return -EFAULT;
1806                                   }
1807                               }
1808                               break;
1809         case V3_DVFS_RELEASE: {
1810                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1811                                   return 0;
1812                               }
1813                               break;
1814         case V3_DVFS_SETFREQ: {
1815                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1816                                   return 0;
1817                               }
1818                               break;
1819         case V3_DVFS_SETPSTATE: {
1820                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1821                                     return 0;
1822                                 }
1823         default: {
1824                      ERROR("Unknown DVFS command %u\n",r.cmd);
1825                      return -EFAULT;
1826                  }
1827                  break;
1828     }
1829 }
1830
1831
1832 void pstate_user_setup(void)
1833 {
1834     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1835 }
1836
1837
1838 void pstate_user_teardown(void)
1839 {
1840     remove_global_ctrl(V3_DVFS_CTRL);
1841 }
1842
1843 static struct v3_host_pstate_ctrl_iface hooks = {
1844     .get_chars = palacios_pstate_ctrl_get_chars,
1845     .acquire = palacios_pstate_ctrl_acquire,
1846     .release = palacios_pstate_ctrl_release,
1847     .set_pstate = palacios_pstate_ctrl_set_pstate,
1848     .get_pstate = palacios_pstate_ctrl_get_pstate,
1849     .set_freq = palacios_pstate_ctrl_set_freq,
1850     .get_freq = palacios_pstate_ctrl_get_freq,
1851 };
1852
1853
1854
1855 static int pstate_ctrl_init(void) 
1856 {
1857     unsigned int cpu;
1858     unsigned int numcpus = num_online_cpus();
1859
1860     pstate_arch_setup();
1861
1862     for (cpu=0;cpu<numcpus;cpu++) { 
1863         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1864     }
1865
1866     V3_Init_Pstate_Ctrl(&hooks);  
1867
1868     if (pstate_proc_setup()) { 
1869         ERROR("Unable to initialize P-State Control\n");
1870         return -1;
1871     }
1872
1873     pstate_user_setup();
1874
1875     pstate_linux_init();
1876
1877     INFO("P-State Control Initialized\n");
1878
1879     return 0;
1880 }
1881
1882 static int pstate_ctrl_deinit(void)
1883 {
1884     unsigned int cpu;
1885     unsigned int numcpus=num_online_cpus();
1886
1887     pstate_linux_deinit();
1888
1889     pstate_user_teardown();
1890
1891     pstate_proc_teardown();
1892
1893     // release pstate control if we have it, and we need to do this on each processor
1894     for (cpu=0;cpu<numcpus;cpu++) { 
1895         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1896     }
1897
1898
1899     // Free any mapping table we built for Intel
1900     if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { 
1901         palacios_free(intel_pstate_to_ctrl);
1902     }
1903
1904
1905     return 0;
1906 }
1907
1908
1909 static struct linux_ext pstate_ext = {
1910     .name = "PSTATE_CTRL",
1911     .init = pstate_ctrl_init,
1912     .deinit = pstate_ctrl_deinit,
1913     .guest_init = NULL,
1914     .guest_deinit = NULL,
1915 };
1916
1917
1918 register_extension(&pstate_ext);
1919
1920
1921