Palacios Public Git Repository

To checkout Palacios execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


9b3e4bb54467c0c54dc332aaa81d7afcf5b217ff
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
31 #include <asm/msr.h>
32 #include <asm/msr-index.h>
33
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
37
38 #include <interfaces/vmm_pstate_ctrl.h>
39
40 #include "palacios.h"
41 #include "iface-pstate-ctrl.h"
42
43 #include "linux-exts.h"
44
45 /*
46    This P-STATE control implementation includes the following modes.
47    You can switch between modes at any time.
48
49    - Internal control of processor states in Palacios (handoff from Linux)
50      When Palacios acquires this control, this module disables Linux cpufreq control
51      and allows code within Palacios unfettered access to the DVFS hardware. 
52    - Direct control of Intel and AMD processor pstates using code in this module
53      When you acquire this control, this module disables Linux cpufreq control
54      and directly programs the processor itself in response to your requests
55    - External control of processor states via Linux 
56      When you acquire this control, this module uses the Linux cpufreq control
57      to program the processor on your behalf
58    - Host control of processor states
59      This is the normal mode of DVFS control (e.g., Linux cpufreq)
60
61    Additionally, it provides a user-space interface for manipulating
62    p-state regardless of the host's functionality.  This includes
63    an ioctl for commanding the implementation and a /proc file for 
64    showing current status and capabilities.  From user space, you can
65    use the Direct, External, and Host modes.  
66
67    What we mean by "p-state" here is the processor's internal
68    configuration.   For AMD, this is defined as being the same as
69    the ACPI-defined p-state.  For Intel, it is not.  There, it is the 
70    contents of the perf ctl MSR, which is opaque.   We try hard to 
71    provide "p-states" that go from 0...max, by analogy or equivalence
72    to the ACPI p-states. 
73
74 */
75
76
77 #define PALACIOS_GOVNAME "v3vee"
78 #define MAX_PATH_LEN     128
79 #define MAX_GOV_NAME_LEN 16
80
81
82 struct pstate_core_info {
83     // Here we have the notion of host control
84 #define V3_PSTATE_HOST_CONTROL 0
85     // and all the modes from the Palacios interface:
86     // V3_PSTATE_EXTERNAL_CONTROL
87     // V3_PSTATE_DIRECT_CONTROL
88     // V3_PSTATE_INTERNAL_CONTROL
89     uint32_t mode;
90
91     // Apply if we are under the DIRECT state
92     uint64_t cur_pstate;
93     uint64_t max_pstate;
94     uint64_t min_pstate;
95
96     uint64_t cur_hw_pstate;
97
98     // Apply if we are under the EXTERNAL state
99     uint64_t set_freq_khz; // this is the frequency we're hoping to get
100     uint64_t cur_freq_khz;
101     uint64_t max_freq_khz;
102     uint64_t min_freq_khz;
103
104     // Intel-specific
105     uint8_t prior_speedstep;
106     uint8_t turbo_disabled;
107     uint8_t no_turbo;
108
109     int have_cpufreq;
110
111     // This is where we stash Linux's governor when we make a mode switch
112     char * linux_governor;
113     // We have this so we can restore the original frequency when we started
114     uint64_t original_hz; 
115
116 };
117
118
119 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
120
121
122
123 // These are used to assert DIRECT control over the core pstates
124 struct pstate_core_funcs {
125     void    (*arch_init)(void);
126     void    (*arch_deinit)(void);
127     uint64_t (*get_min_pstate)(void);
128     uint64_t (*get_max_pstate)(void);
129     uint64_t (*get_pstate)(void);
130     void    (*set_pstate)(uint64_t pstate);
131 };
132
133 struct pstate_machine_info {
134     enum {INTEL, AMD, OTHER } arch;
135     int supports_pstates;
136
137
138     // For AMD
139     int have_pstate;
140     int have_coreboost;
141     int have_feedback;  
142
143     // For Intel
144     int have_speedstep;
145     int have_opportunistic; // this means "Turbo Boost" or "IDA"
146     int have_policy_hint;
147     int have_hwp;       // hardware-controlled performance states
148     int have_hdc;       // hardware duty cycling
149     int have_mwait_ext; // mwait power extensions
150     int have_mwait_int; // mwait wakes on interrupt
151
152     // for both
153     int have_pstate_hw_coord;  // mperf/aperf
154
155     // used for DIRECT control
156     struct pstate_core_funcs *funcs;
157
158 };
159
160 static struct pstate_machine_info machine_state;
161
162
163 /****************************************************
164   AMD  DIRECT CONTROL
165  ***************************************************/
166
167 /* AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557 */
168 #define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
169 #define MSR_PSTATE_CTL_REG_AMD   0xc0010062
170 #define MSR_PSTATE_STAT_REG_AMD  0xc0010063
171
172 struct p_state_limit_reg_amd {
173     union {
174         uint64_t val;
175         struct {
176             uint8_t  pstate_limit : 4; /* lowest P-state value (highest perf.) supported currently (this can change at runtime) */
177             uint8_t  pstate_max   : 4; /* highest P-state value supported  (lowest perf) */
178             uint64_t rsvd         : 56;
179         } reg;
180     } __attribute__((packed));
181 } __attribute__((packed));
182
183
184 struct p_state_stat_reg_amd {
185     union {
186         uint64_t val;
187         struct {
188             uint8_t  pstate  : 4;
189             uint64_t rsvd    : 60;
190         } reg;
191     } __attribute__((packed));
192 } __attribute__((packed));
193
194
195 struct p_state_ctl_reg_amd {
196     union {
197         uint64_t val;
198         struct {
199             uint8_t  cmd  : 4;
200             uint64_t rsvd : 60;
201         } reg;
202     } __attribute__((packed));
203 } __attribute__((packed));
204
205
206 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
207 static uint8_t supports_pstates_amd (void)
208 {
209     int i;
210     int mapwrong=0;
211     int amd_num_pstates;
212
213     uint32_t eax, ebx, ecx, edx;
214
215     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
216     machine_state.have_pstate = !!(edx & (1 << 7));
217     machine_state.have_coreboost = !!(edx & (1<<9));
218     machine_state.have_feedback = !!(edx & (1<<11));
219
220     cpuid(0x6, &eax, &ebx, &ecx, &edx);
221     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
222
223     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
224             machine_state.have_pstate, 
225             machine_state.have_coreboost, 
226             machine_state.have_feedback,
227             machine_state.have_pstate_hw_coord);
228
229     amd_num_pstates = get_cpu_var(processors)->performance->state_count;
230     if (amd_num_pstates) { 
231         for (i=0;i<amd_num_pstates;i++) { 
232             INFO("P-State: %u: freq=%llu ctrl=%llx%s\n",
233                  i, 
234                  get_cpu_var(processors)->performance->states[i].core_frequency*1000,
235                  get_cpu_var(processors)->performance->states[i].control,
236                  get_cpu_var(processors)->performance->states[i].control != i ? (mapwrong=1, " ALERT - CTRL MAPPING NOT 1:1") : "");
237         }
238     }
239     if (mapwrong) { 
240         ERROR("P-State: AMD: mapping of pstate and control is not 1:1 on this processor - we will probably not work corrrectly\n");
241     }
242
243     return machine_state.have_pstate;
244
245
246 }
247
248
/* AMD needs no per-core setup before direct p-state control (intentionally empty). */
static void init_arch_amd(void)
{
}
253
254
/* AMD needs no per-core teardown after direct p-state control (intentionally empty). */
static void deinit_arch_amd(void)
{
}
259
260
261 static uint64_t get_pstate_amd(void) 
262 {
263     struct p_state_stat_reg_amd pstat;
264
265     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
266
267     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
268     put_cpu_var(core_state);
269
270     return pstat.reg.pstate;
271 }
272
273
274 static void set_pstate_amd(uint64_t p)
275 {
276     struct p_state_ctl_reg_amd pctl;
277
278     if (p>get_cpu_var(core_state).max_pstate) { 
279         p=get_cpu_var(core_state).max_pstate;
280     }
281     put_cpu_var(core_state);
282
283     pctl.val = 0;
284     pctl.reg.cmd = p;
285
286     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
287
288     get_cpu_var(core_state).cur_pstate=p;
289     put_cpu_var(core_state);
290 }
291
292
293 /*
294  * NOTE: HW may change this value at runtime
295  */
296 static uint64_t get_max_pstate_amd(void)
297 {
298     struct p_state_limit_reg_amd plimits;
299
300     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
301
302     return plimits.reg.pstate_max;
303 }
304
305
306 static uint64_t get_min_pstate_amd(void)
307 {
308     struct p_state_limit_reg_amd plimits;
309
310     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
311
312     return plimits.reg.pstate_limit;
313 }
314
315
316 static struct pstate_core_funcs amd_funcs =
317 {
318     .arch_init        = init_arch_amd,
319     .arch_deinit      = deinit_arch_amd,
320     .get_pstate       = get_pstate_amd,
321     .set_pstate       = set_pstate_amd,
322     .get_max_pstate   = get_max_pstate_amd,
323     .get_min_pstate   = get_min_pstate_amd,
324 };
325
326
327
328 /***********************************************************
329   INTEL DIRECT CONTROL
330  **********************************************************/
331
332
333 /*
334    This implementation uses SpeedStep, but does check
335    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
336    are available.
337 */
338
339 /* Intel System Programmer's Manual Vol. 3B, 14-2 */
340 #define MSR_MPERF_IA32         0x000000e7
341 #define MSR_APERF_IA32         0x000000e8
342 #define MSR_MISC_ENABLE_IA32   0x000001a0
343 #define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
344 #define MSR_PLATFORM_INFO_IA32 0x000000ce
345 #define MSR_PERF_CTL_IA32      0x00000199
346 #define MSR_PERF_STAT_IA32     0x00000198
347 #define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
348
349
350 /* Note that the actual meaning of the pstate
351    in the control and status registers is
352    implementation dependent, unlike AMD.   The "official"
353    way to figure out the mapping from pstate to 
354    these values is via ACPI.  What is written in the register
355    is an "id" of an operating point.
356
357    "Often", the 16 bit field consists of a high order byte
358    which is the frequency (the multiplier) and the low order
359    byte is the voltage. 
360    */
361 // MSR_PERF_CTL_IA32  r/w
362 struct perf_ctl_reg_intel {
363     union {
364         uint64_t val;
365         struct {
366             // This is the target
367             // Note, not the ACPI pstate, but
368             // Intel's notion of pstate is that it's opaque
369             // for lots of implementations it seems to be
370             // frequency_id : voltage_id
371             // where frequency_id is typically the multiplier
372             uint16_t pstate                 : 16;
373             uint16_t reserved               : 16;
374             // set to 1 to *disengage* dynamic acceleration
375             // Note that "IDA" and "Turbo" use the same interface
376             uint16_t dynamic_accel_disable  : 1;
377             uint32_t reserved2              : 31;
378         } reg;
379     } __attribute__((packed));
380 } __attribute__((packed));
381
382 // MSR_PERF_STAT_IA32 r
383 struct perf_stat_reg_intel {
384     union {
385         uint64_t val;
386         struct {
387             // this is the current
388             uint16_t pstate                 : 16;
389             uint64_t reserved               : 48;
390         } reg;
391     } __attribute__((packed));
392 } __attribute__((packed));
393
394 // MSR_ENERGY_PERF_BIAS_IA32 r/w
395 struct enery_perf_bias_reg_intel {
396     union {
397         uint64_t val;
398         struct {
399             // this is the current
400             uint8_t  policy_hint            : 4;
401             uint64_t reserved               : 60;
402         } reg;
403     } __attribute__((packed));
404 } __attribute__((packed));
405
406 // MSR_PLATFORM_INFO
407 struct turbo_mode_info_reg_intel {
408     union {
409         uint64_t val;
410         struct {
411             uint8_t  rsvd0                  : 8;
412             uint8_t  max_noturbo_ratio      : 8;
413             uint8_t  rsvd1                  : 7;
414             uint8_t  ppin_cap               : 1;
415             uint8_t  rsvd2                  : 4;
416             uint8_t  ratio_limit            : 1; 
417             uint8_t  tdc_tdp_limit          : 1;
418             uint16_t rsvd3                  : 10;
419             uint8_t  min_ratio              : 8;
420             uint16_t rsvd4                  : 16;
421         } reg;
422     } __attribute__((packed));
423 } __attribute__((packed));
424
425 // This replicates the critical information in Linux's struct acpi_processor_px
426 // To make it easier to port to other OSes.    
427 struct intel_pstate_info {
428     uint64_t freq;  // KHz
429     uint64_t ctrl;  // What to write into the _CTL MSR to get this
430 };
431
432 // The internal array will be used if we cannot build the table locally
433 static struct intel_pstate_info *intel_pstate_to_ctrl_internal=0;
434 static int intel_num_pstates_internal=0;
435
436 // These will either point to the internal array or to a constructed array
437 static struct intel_pstate_info *intel_pstate_to_ctrl=0;
438 static int intel_num_pstates=0;
439
440
441 /* CPUID.01:ECX.AES(7) */
442 static uint8_t supports_pstates_intel(void)
443 {
444     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
445     */
446     uint32_t eax, ebx, ecx, edx;
447
448     cpuid(0x1, &eax, &ebx, &ecx, &edx);
449     machine_state.have_speedstep =  !!(ecx & (1 << 7));
450
451     cpuid(0x6, &eax, &ebx, &ecx, &edx);
452     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
453     machine_state.have_opportunistic =  !!(eax & 1<<1);
454     machine_state.have_policy_hint = !!(ecx & 1<<3);
455     machine_state.have_hwp = !!(eax & 1<<7);
456     machine_state.have_hdc = !!(eax & 1<<13);
457
458     cpuid(0x5, &eax, &ebx, &ecx, &edx);
459     machine_state.have_mwait_ext =  !!(ecx & 1);
460     machine_state.have_mwait_int =  !!(ecx & 1<<1);
461
462
463     // Note we test all the available hardware features documented as of August 2014
464     // We are only currently using speed_step, however.
465
466     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
467             machine_state.have_speedstep, 
468             machine_state.have_pstate_hw_coord, 
469             machine_state.have_opportunistic,
470             machine_state.have_policy_hint,
471             machine_state.have_hwp,
472             machine_state.have_hdc,
473             machine_state.have_mwait_ext,
474             machine_state.have_mwait_int );
475
476
477     if (machine_state.have_speedstep) {
478         uint32_t i;
479         // Build mapping table (from "pstate" (0..) to ctrl value for MSR
480         if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { 
481             put_cpu_var(processors);
482             // no acpi...  revert to internal table
483             intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
484             intel_num_pstates=intel_num_pstates_internal;
485         } else {
486             intel_num_pstates = get_cpu_var(processors)->performance->state_count;
487             if (intel_num_pstates) { 
488                 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
489                 if (!intel_pstate_to_ctrl) { 
490                     ERROR("P-State: Cannot allocate space for mapping...\n");
491                     intel_num_pstates=0;
492                 }
493                 for (i=0;i<intel_num_pstates;i++) { 
494                     intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
495                     intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
496                 }
497                     
498             } else {
499                 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
500             }
501         }
502         put_cpu_var(processors);
503         INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
504         for (i=0;i<intel_num_pstates;i++) {
505             INFO("P-State: Intel Mapping %u:  freq=%llu  ctrl=%llx\n",
506                  i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
507         }
508     } else {
509         INFO("P-State: Intel:  No speedstep here\n");
510     }
511         
512
513     return machine_state.have_speedstep;
514 }
515
516
517 static void init_arch_intel(void)
518 {
519     uint64_t val;
520
521     rdmsrl(MSR_MISC_ENABLE_IA32, val);
522
523     //INFO("P-State: prior ENABLE=%llx\n",val);
524
525     // store prior speedstep setting
526     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
527     put_cpu_var(core_state);
528
529     // enable speedstep (probably already on)
530     val |= 1 << 16;
531     wrmsrl(MSR_MISC_ENABLE_IA32, val);
532
533     //INFO("P-State: write ENABLE=%llx\n",val);
534
535 }
536
537 static void deinit_arch_intel(void)
538 {
539     uint64_t val;
540
541     rdmsrl(MSR_MISC_ENABLE_IA32, val);
542
543     //INFO("P-State: deinit: ENABLE=%llx\n",val);
544
545     val &= ~(1ULL << 16);
546     val |= get_cpu_var(core_state).prior_speedstep << 16;
547     put_cpu_var(core_state);
548
549     wrmsrl(MSR_MISC_ENABLE_IA32, val);
550
551     //INFO("P-state: deinit ENABLE=%llx\n",val);
552
553 }
554
555 /* TODO: Intel P-states require sampling at intervals... */
556 static uint64_t get_pstate_intel(void)
557 {
558     uint64_t val;
559
560     rdmsrl(MSR_PERF_STAT_IA32,val);
561
562     //INFO("P-State: Get: 0x%llx\n", val);
563
564     // should check if turbo is active, in which case 
565     // this value is not the whole story
566
567     return val;
568 }
569
570 static void set_pstate_intel(uint64_t p)
571 {
572     uint64_t val;
573     uint64_t ctrl;
574
575     if (intel_num_pstates==0) { 
576         return ;
577     } else {
578         if (p>=intel_num_pstates) { 
579             p=intel_num_pstates-1;
580         }
581     }
582
583     ctrl=intel_pstate_to_ctrl[p].ctrl;
584
585     /* ...Intel IDA (dynamic acceleration)
586        if (c->no_turbo && !c->turbo_disabled) {
587        val |= 1 << 32;
588        }
589        */
590     // leave all bits along expect for the likely
591     // fid bits
592
593     rdmsrl(MSR_PERF_CTL_IA32, val);
594     //INFO("P-State: Pre-Set: 0x%llx\n", val);
595
596     val &= ~0xffffULL;
597     val |= ctrl & 0xffffULL;
598
599     //INFO("P-State: Set: 0x%llx\n", val);
600
601     wrmsrl(MSR_PERF_CTL_IA32, val);
602
603     get_cpu_var(core_state).cur_pstate = p;
604     put_cpu_var(core_state);
605 }
606
607
/* The lowest Intel p-state index is always the first table entry. */
static uint64_t get_min_pstate_intel(void)
{
    const uint64_t first_index = 0;

    return first_index;
}
612
613
614
615 static uint64_t get_max_pstate_intel (void)
616 {
617     if (intel_num_pstates==0) { 
618         return 0;
619     } else {
620         return intel_num_pstates-1;
621     }
622 }
623
624 static struct pstate_core_funcs intel_funcs =
625 {
626     .arch_init        = init_arch_intel,
627     .arch_deinit      = deinit_arch_intel,
628     .get_pstate       = get_pstate_intel,
629     .set_pstate       = set_pstate_intel,
630     .get_max_pstate   = get_max_pstate_intel,
631     .get_min_pstate   = get_min_pstate_intel,
632 };
633
634
635
636 /***********************************************
637   Arch determination and setup
638  ***********************************************/
639
/*
 * Execute CPUID for leaf `id` and store EAX, EBX, ECX, EDX into
 * dest[0..3] in that order.
 */
static inline void cpuid_string (uint32_t id, uint32_t dest[4]) 
{
    __asm__ __volatile__("cpuid"
            : "=a"(dest[0]), "=b"(dest[1]), "=c"(dest[2]), "=d"(dest[3])
            : "a"(id));
}
646
647
/*
 * Fetch the CPU vendor string via CPUID leaf 0.
 *
 * @name: receives the 12-character vendor id plus a terminating NUL.
 *        The id is assembled from EBX, EDX, ECX in that order.
 *
 * Returns the EAX value from leaf 0.
 *
 * The registers are copied with memcpy rather than by casting the char
 * buffer to uint32_t*, which avoids aliasing and alignment problems.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t dest[4];

    cpuid_string(0,dest);

    memcpy(name,     &dest[1], sizeof(uint32_t));   /* EBX */
    memcpy(name + 4, &dest[3], sizeof(uint32_t));   /* EDX */
    memcpy(name + 8, &dest[2], sizeof(uint32_t));   /* ECX */
    name[12]=0;

    return dest[0];
}
662
663
/* Returns 1 if the CPUID vendor string is "GenuineIntel", else 0. */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "GenuineIntel") == 0;
}
670
671
/* Returns 1 if the CPUID vendor string is "AuthenticAMD", else 0. */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "AuthenticAMD") == 0;
}
678
679 static int pstate_arch_setup(void)
680 {
681
682     if (is_amd()) {
683         machine_state.arch = AMD;
684         machine_state.funcs = &amd_funcs;
685         machine_state.supports_pstates = supports_pstates_amd();
686         INFO("PSTATE: P-State initialized for AMD\n");
687     } else if (is_intel()) {
688         machine_state.arch  = INTEL;
689         machine_state.funcs = &intel_funcs;
690         machine_state.supports_pstates = supports_pstates_intel();
691         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
692         return 0;
693
694     } else {
695         machine_state.arch = OTHER;
696         machine_state.funcs = NULL;
697         machine_state.supports_pstates = 0;
698         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
699         return 0;
700     }
701
702     return 0;
703 }
704
705
706
707 /******************************************************************
708   Linux Interface
709  *****************************************************************/
710
711 static unsigned cpus_using_v3_governor;
712 static DEFINE_MUTEX(v3_governor_mutex);
713
714 /* KCH: this will tell us when there is an actual frequency transition */
715 static int v3_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
716         void *data)
717 {
718     struct cpufreq_freqs *freq = data;
719
720     if (per_cpu(core_state, freq->cpu).mode != V3_PSTATE_EXTERNAL_CONTROL) {
721         return 0;
722     }
723
724     if (val == CPUFREQ_POSTCHANGE) {
725         DEBUG("P-State: frequency change took effect on cpu %u (now %u kHz)\n",
726                 freq->cpu, freq->new);
727         per_cpu(core_state, freq->cpu).cur_freq_khz = freq->new;
728     }
729
730     return 0;
731
732 }
733
734
735 static struct notifier_block v3_cpufreq_notifier_block = {
736     .notifier_call = v3_cpufreq_notifier
737 };
738
739
740 /* 
741  * This stub governor is simply a placeholder for preventing 
742  * frequency changes from the Linux side. For now, we simply leave
743  * the frequency as is when we acquire control. 
744  */
745 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
746 {
747     unsigned cpu = policy->cpu;
748
749     switch (event) {
750         /* we can't use cpufreq_driver_target here as it can result
751          * in a circular dependency, so we'll keep the current frequency as is
752          */
753         case CPUFREQ_GOV_START:
754             BUG_ON(!policy->cur);
755
756             mutex_lock(&v3_governor_mutex);
757
758             if (cpus_using_v3_governor == 0) {
759                 cpufreq_register_notifier(&v3_cpufreq_notifier_block,
760                         CPUFREQ_TRANSITION_NOTIFIER);
761             }
762
763             cpus_using_v3_governor++;
764
765             per_cpu(core_state, cpu).set_freq_khz = policy->cur;
766             per_cpu(core_state, cpu).cur_freq_khz = policy->cur;
767             per_cpu(core_state, cpu).max_freq_khz = policy->max;
768             per_cpu(core_state, cpu).min_freq_khz = policy->min;
769
770             mutex_unlock(&v3_governor_mutex);
771             break;
772         case CPUFREQ_GOV_STOP:
773             mutex_lock(&v3_governor_mutex);
774
775             cpus_using_v3_governor--;
776
777             if (cpus_using_v3_governor == 0) {
778                 cpufreq_unregister_notifier(
779                         &v3_cpufreq_notifier_block,
780                         CPUFREQ_TRANSITION_NOTIFIER);
781             }
782
783             per_cpu(core_state, cpu).set_freq_khz = 0;
784             per_cpu(core_state, cpu).cur_freq_khz = 0;
785             per_cpu(core_state, cpu).max_freq_khz = 0;
786             per_cpu(core_state, cpu).min_freq_khz = 0;
787
788             mutex_unlock(&v3_governor_mutex);
789             break;
790         case CPUFREQ_GOV_LIMITS:
791             /* do nothing */
792             break;
793         default:
794             ERROR("Undefined governor command (%u)\n", event);
795             return -1;
796     }                           
797
798     return 0;
799 }
800
801
802 static struct cpufreq_governor stub_governor = 
803 {
804     .name = PALACIOS_GOVNAME,
805     .governor = governor_run,
806     .owner = THIS_MODULE,
807 };
808
809
810 static struct workqueue_struct *pstate_wq;
811
812 typedef struct {
813     struct work_struct work;
814     uint64_t freq;
815 } pstate_work_t;
816
817
818
819 static inline void pstate_register_linux_governor(void)
820 {
821     cpufreq_register_governor(&stub_governor);
822 }
823
824
825 static inline void pstate_unregister_linux_governor(void)
826 {
827     cpufreq_unregister_governor(&stub_governor);
828 }
829
830
831 static int pstate_linux_init(void)
832 {
833     pstate_register_linux_governor();
834     pstate_wq = create_workqueue("v3vee_pstate_wq");
835     if (!pstate_wq) {
836         ERROR("Could not create work queue\n");
837         goto out_err;
838     }
839
840     return 0;
841
842 out_err:
843     pstate_unregister_linux_governor();
844     return -1;
845 }
846
847
848 static void pstate_linux_deinit(void)
849 {
850     pstate_unregister_linux_governor();
851     flush_workqueue(pstate_wq);
852     destroy_workqueue(pstate_wq);
853 }
854
855
856 static int get_current_governor(char **buf, unsigned int cpu)
857 {
858     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
859     char * govname = NULL;
860
861     if (!policy) {
862         ERROR("could not allocate cpufreq_policy\n");
863         return -1;
864     }
865         
866     if (cpufreq_get_policy(policy, cpu) != 0) {
867         ERROR("Could not get current cpufreq policy\n");
868         goto out_err;
869     }
870
871     /* We're in interrupt context, should probably not wait here */
872     govname = palacios_alloc(MAX_GOV_NAME_LEN);
873     if (!govname) {
874         ERROR("Could not allocate space for governor name\n");
875         goto out_err;
876     }
877
878     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
879     govname[MAX_GOV_NAME_LEN-1] = 0;
880
881     get_cpu_var(core_state).linux_governor = govname;
882     put_cpu_var(core_state);
883
884     *buf = govname;
885
886     palacios_free(policy);
887
888     return 0;
889
890 out_err:
891     palacios_free(policy);
892     return -1;
893 }
894
895
896 /* passed to the userspacehelper interface for cleanup */
897 static void gov_switch_cleanup(struct subprocess_info * s)
898 {
899     palacios_free(s->argv[2]);
900     palacios_free(s->argv);
901 }
902
903
904 /* 
905  * Switch governors
906  * @s - the governor to switch to 
907  * TODO: this should probably be submitted to a work queue
908  * so we don't have to run it in interrupt context
909  */
910 static int governor_switch(char * s, unsigned int cpu)
911 {
912     char * path_str = NULL;
913     char ** argv = NULL; 
914
915     static char * envp[] = {
916         "HOME=/",
917         "TERM=linux",
918         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
919
920
921     argv = palacios_alloc(4*sizeof(char*));
922     if (!argv) {
923         ERROR("Couldn't allocate argv struct\n");
924         return -1;
925     }
926
927     path_str = palacios_alloc(MAX_PATH_LEN);
928     if (!path_str) {
929         ERROR("Couldn't allocate path string\n");
930         goto out_freeargv;
931     }
932     memset(path_str, 0, MAX_PATH_LEN);
933
934     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
935
936     argv[0] = "/bin/sh";
937     argv[1] = "-c";
938     argv[2] = path_str;
939     argv[3] = NULL;
940
941     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
942     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
943
944 out_freeargv:
945     palacios_free(argv);
946     return -1;
947 }
948
949
950 static inline void free_linux_governor(void)
951 {
952     palacios_free(get_cpu_var(core_state).linux_governor);
953     put_cpu_var(core_state);
954 }
955
956
957 static int linux_setup_palacios_governor(void)
958 {
959     char * gov;
960     unsigned int cpu = get_cpu();
961     put_cpu();
962
963     /* KCH:  we assume the v3vee governor is already 
964      * registered with kernel by this point 
965      */
966
967     if (get_current_governor(&gov, cpu) < 0) {
968         ERROR("Could not get current governor\n");
969         return -1;
970     }
971
972     DEBUG("saving current governor (%s)\n", gov);
973
974     get_cpu_var(core_state).linux_governor = gov;
975     put_cpu_var(core_state);
976     
977     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
978
979     /* set the new one to ours */
980
981     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
982         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
983         return -1;
984     }
985
986     return 0;
987 }
988
989
990
991 static uint64_t linux_get_pstate(void)
992 {
993     struct cpufreq_policy * policy = NULL;
994     struct cpufreq_frequency_table *table;
995     unsigned int i = 0;
996     unsigned int count = 0;
997     unsigned int cpu = get_cpu(); 
998     put_cpu();
999
1000
1001     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1002     if (!policy) {
1003         ERROR("Could not allocate policy struct\n");
1004         return -1;
1005     }
1006
1007     cpufreq_get_policy(policy, cpu);
1008     table = cpufreq_frequency_get_table(cpu);
1009
1010     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1011
1012         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1013             continue;
1014         }
1015
1016         if (table[i].frequency == policy->cur) {
1017             break;
1018         }
1019
1020         count++;
1021     }
1022
1023     palacios_free(policy);
1024
1025     put_cpu();
1026     return count;
1027 }
1028
1029
1030 static uint64_t linux_get_freq(void)
1031 {
1032     uint64_t freq;
1033     struct cpufreq_policy * policy = NULL;
1034     unsigned int cpu = get_cpu();
1035     put_cpu();
1036
1037     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1038     if (!policy) {
1039         ERROR("Could not allocate policy struct\n");
1040         return -1;
1041     }
1042
1043     if (cpufreq_get_policy(policy, cpu)) {
1044         ERROR("Could not get current policy\n");
1045         return -1;
1046     }
1047
1048     freq=policy->cur;
1049
1050     palacios_free(policy);
1051
1052     return freq;
1053 }
1054
1055 static void  
1056 pstate_switch_workfn (struct work_struct *work)
1057 {
1058     pstate_work_t * pwork = (pstate_work_t*)work;
1059     struct cpufreq_policy * policy = NULL;
1060     uint64_t freq; 
1061     unsigned int cpu = get_cpu();
1062     put_cpu();
1063
1064     mutex_lock(&v3_governor_mutex);
1065
1066     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1067     if (!policy) {
1068         ERROR("Could not allocate space for cpufreq policy\n");
1069         goto out;
1070     }
1071
1072     if (cpufreq_get_policy(policy, cpu) != 0) {
1073         ERROR("Could not get cpufreq policy\n");
1074         goto out1;
1075     }
1076
1077     freq = pwork->freq;
1078     get_cpu_var(core_state).set_freq_khz = freq;
1079
1080     if (freq < get_cpu_var(core_state).min_freq_khz) {
1081         freq = get_cpu_var(core_state).min_freq_khz;
1082     }
1083     if (freq > get_cpu_var(core_state).max_freq_khz) {
1084         freq = get_cpu_var(core_state).max_freq_khz;
1085     }
1086     put_cpu_var(core_state);
1087
1088     INFO("P-state: requesting frequency change on core %u to %llu\n", cpu, freq);
1089     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
1090
1091 out1:
1092     palacios_free(policy);
1093 out:
1094     palacios_free(work);
1095     mutex_unlock(&v3_governor_mutex);
1096
1097
1098
1099 static int linux_set_pstate(uint64_t p)
1100 {
1101     struct cpufreq_policy * policy = NULL;
1102     struct cpufreq_frequency_table *table;
1103     pstate_work_t * work = NULL;
1104     unsigned int i = 0;
1105     unsigned int count = 0;
1106     int state_set = 0;
1107     int last_valid = 0;
1108     unsigned int cpu = get_cpu();
1109     put_cpu();
1110
1111     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1112     if (!policy) {
1113         ERROR("Could not allocate policy struct\n");
1114         return -1;
1115     }
1116
1117     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1118     if (!work) {
1119         ERROR("Could not allocate work struct\n");
1120         goto out_err;
1121     }
1122
1123     if (cpufreq_get_policy(policy, cpu)) {
1124         ERROR("Could not get current policy\n");
1125         goto out_err1;
1126     }
1127     table = cpufreq_frequency_get_table(cpu);
1128
1129     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1130
1131         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1132             continue;
1133         }
1134
1135         if (count == p) {
1136
1137             INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1138             work->freq = table[i].frequency;
1139             queue_work(pstate_wq, (struct work_struct*)work);
1140
1141             state_set = 1;
1142             break;
1143         }
1144
1145         count++;
1146         last_valid = i;
1147     }
1148
1149     /* we need to deal with the case in which we get a number > max pstate */
1150     if (!state_set) {
1151         INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1152         work->freq = table[last_valid].frequency;
1153         queue_work(pstate_wq, (struct work_struct*)work);
1154     }
1155
1156     palacios_free(policy);
1157     return 0;
1158
1159 out_err1: 
1160     palacios_free(work);
1161 out_err:
1162     palacios_free(policy);
1163     return -1;
1164 }
1165
1166
1167 static int linux_set_freq(uint64_t f)
1168 {
1169     struct cpufreq_policy * policy = NULL;
1170     pstate_work_t * work = NULL;
1171     uint64_t freq;
1172     unsigned int cpu = get_cpu();
1173     put_cpu();
1174
1175     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1176     if (!policy) {
1177         ERROR("Could not allocate policy struct\n");
1178         return -1;
1179     }
1180
1181     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1182     if (!work) {
1183         ERROR("Could not allocate work struct\n");
1184         goto out_err;
1185     }
1186
1187     if (cpufreq_get_policy(policy, cpu) != 0) {
1188         ERROR("Could not get cpufreq policy\n");
1189         goto out_err1;
1190     }
1191
1192     if (f < policy->min) {
1193         freq = policy->min;
1194     } else if (f > policy->max) {
1195         freq = policy->max;
1196     } else {
1197         freq = f;
1198     }
1199
1200     INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1201     work->freq = freq;
1202     queue_work(pstate_wq, (struct work_struct*)work);
1203
1204     palacios_free(policy);
1205     return 0;
1206
1207 out_err1:
1208     palacios_free(work);
1209 out_err:
1210     palacios_free(policy);
1211     return -1;
1212 }
1213
1214
1215 static int linux_restore_defaults(void)
1216 {
1217     char * gov = NULL;
1218     unsigned int cpu = get_cpu();
1219     put_cpu();
1220
1221     gov = get_cpu_var(core_state).linux_governor;
1222     put_cpu_var(core_state);
1223
1224     DEBUG("restoring previous governor (%s)\n", gov);
1225
1226     if (governor_switch(gov, cpu) < 0) {
1227         ERROR("Could not restore governor to (%s)\n", gov);
1228         goto out_err;
1229     }
1230
1231     free_linux_governor();
1232     return 0;
1233
1234 out_err:
1235     free_linux_governor();
1236     return -1;
1237 }
1238
1239
1240
1241 /******************************************************************
1242   Generic Interface as provided to Palacios and to the rest of the
1243   module
1244  ******************************************************************/
1245
1246 static void init_core(void)
1247 {
1248     unsigned cpu;
1249     struct cpufreq_policy *p;
1250
1251
1252     //DEBUG("P-State Core Init\n");
1253
1254     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1255     get_cpu_var(core_state).cur_pstate = 0;
1256
1257     if (machine_state.funcs) {
1258         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1259         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1260     } else {
1261         get_cpu_var(core_state).min_pstate = 0;
1262         get_cpu_var(core_state).max_pstate = 0;
1263     }
1264
1265
1266     cpu = get_cpu(); put_cpu();
1267
1268     p = cpufreq_cpu_get(cpu);
1269
1270     if (!p) { 
1271         get_cpu_var(core_state).have_cpufreq = 0;
1272         get_cpu_var(core_state).min_freq_khz=0;
1273         get_cpu_var(core_state).max_freq_khz=0;
1274         get_cpu_var(core_state).cur_freq_khz=0;
1275     } else {
1276         get_cpu_var(core_state).have_cpufreq = 1;
1277         get_cpu_var(core_state).min_freq_khz=p->min;
1278         get_cpu_var(core_state).max_freq_khz=p->max;
1279         get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); 
1280     put_cpu_var(core_state);
1281
1282     /*
1283     for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) { 
1284         INFO("P-State: %u: freq=%llu ctrl=%llx",
1285                 i, 
1286                 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1287                 get_cpu_var(processors)->performance->states[i].control);
1288    }
1289    put_cpu_var(processors);
1290     */
1291 }
1292
1293
1294 void palacios_pstate_ctrl_release(void);
1295
1296
/*
 * Per-core teardown: give P-state control of this core back to the host.
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    /* returns immediately if the core is already in host-control mode */
    palacios_pstate_ctrl_release();
}
1303
1304
1305
1306 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1307 {
1308     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1309
1310
1311     c->features = V3_PSTATE_INTERNAL_CONTROL;
1312
1313     if (get_cpu_var(core_state).have_cpufreq) {
1314         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1315     }
1316
1317     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1318         c->features |= V3_PSTATE_DIRECT_CONTROL;
1319     }
1320     c->cur_mode = get_cpu_var(core_state).mode;
1321     c->min_pstate = get_cpu_var(core_state).min_pstate;
1322     c->max_pstate = get_cpu_var(core_state).max_pstate;
1323     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1324     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1325     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1326     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1327
1328     put_cpu_var(core_state);
1329
1330
1331
1332 }
1333
1334
1335 uint64_t palacios_pstate_ctrl_get_pstate(void)
1336 {
1337     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1338         put_cpu_var(core_state);
1339         return machine_state.funcs->get_pstate();
1340     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1341         put_cpu_var(core_state);
1342         return linux_get_pstate();
1343     } else {
1344         put_cpu_var(core_state);
1345         return 0;
1346     }
1347 }
1348
1349
1350 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1351 {
1352     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1353         put_cpu_var(core_state);
1354         machine_state.funcs->set_pstate(p);
1355     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1356         put_cpu_var(core_state);
1357         linux_set_pstate(p);
1358     } else {
1359         put_cpu_var(core_state);
1360     }
1361 }
1362
1363
/*
 * Adapter taking the pstate packed into a void* so the request can be
 * issued through palacios_xcall().  Only the low 8 bits of the value are
 * passed on.
 */
void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
{
    uint8_t pstate = (uint8_t)(uint64_t)p;

    palacios_pstate_ctrl_set_pstate(pstate);
}
1368
1369
1370 uint64_t palacios_pstate_ctrl_get_freq(void)
1371 {
1372     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1373         put_cpu_var(core_state);
1374         return linux_get_freq();
1375     } else {
1376         put_cpu_var(core_state);
1377         return 0;
1378     }
1379 }
1380
1381
1382 void palacios_pstate_ctrl_set_freq(uint64_t p)
1383 {
1384     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1385         put_cpu_var(core_state);
1386         linux_set_freq(p);
1387     } else {
1388         put_cpu_var(core_state);
1389     }
1390 }
1391
1392
1393 static int switch_to_external(void)
1394 {
1395     DEBUG("switch from host control to external\n");
1396
1397     if (!(get_cpu_var(core_state).have_cpufreq)) {
1398         put_cpu_var(core_state);
1399         ERROR("No cpufreq  - cannot switch to external...\n");
1400         return -1;
1401     } 
1402     put_cpu_var(core_state);
1403
1404     linux_setup_palacios_governor();
1405
1406     get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1407     put_cpu_var(core_state);
1408
1409     return 0;
1410 }
1411
1412
1413 static int switch_to_direct(void)
1414 {
1415     DEBUG("switch from host control to direct\n");
1416
1417     if (get_cpu_var(core_state).have_cpufreq) { 
1418         put_cpu_var(core_state);
1419         DEBUG("switch to direct from cpufreq\n");
1420
1421         // The implementation would set the policy and governor to peg cpu
1422         // regardless of load
1423         linux_setup_palacios_governor();
1424     } else {
1425         put_cpu_var(core_state);
1426     }
1427
1428     if (machine_state.funcs && machine_state.funcs->arch_init) {
1429         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1430
1431         machine_state.funcs->arch_init();
1432
1433         put_cpu_var(core_state);
1434     }
1435
1436     return 0;
1437 }
1438
1439
1440 static int switch_to_internal(void)
1441 {
1442     DEBUG("switch from host control to internal\n");
1443
1444     if (get_cpu_var(core_state).have_cpufreq) { 
1445         put_cpu_var(core_state);
1446         DEBUG("switch to internal on machine with cpu freq\n");
1447         linux_setup_palacios_governor();
1448     } else {
1449         put_cpu_var(core_state);
1450     }
1451
1452     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1453
1454     put_cpu_var(core_state);
1455
1456     return 0;
1457 }
1458
1459
1460 static int switch_from_external(void)
1461 {
1462     if (!(get_cpu_var(core_state).have_cpufreq)) {
1463         put_cpu_var(core_state);
1464         ERROR("No cpufreq  - how did we get here... external...\n");
1465         return -1;
1466     }
1467     put_cpu_var(core_state);
1468
1469     DEBUG("Switching back to host control from external\n");
1470
1471     if (get_cpu_var(core_state).have_cpufreq) { 
1472         put_cpu_var(core_state);
1473         linux_restore_defaults();
1474     } else {
1475         put_cpu_var(core_state);
1476     }
1477
1478     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1479     put_cpu_var(core_state);
1480
1481     return 0;
1482 }
1483
1484
1485 static int switch_from_direct(void)
1486 {
1487
1488     DEBUG("Switching back to host control from direct\n");
1489
1490     // Set maximum performance, just in case there is no host control
1491     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1492     machine_state.funcs->arch_deinit();
1493
1494     if (get_cpu_var(core_state).have_cpufreq) { 
1495         put_cpu_var(core_state);
1496         linux_restore_defaults();
1497     } else {
1498         put_cpu_var(core_state);
1499     }
1500
1501     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1502
1503     put_cpu_var(core_state);
1504
1505     return 0;
1506 }
1507
1508
1509 static int switch_from_internal(void)
1510 {
1511     DEBUG("Switching back to host control from internal\n");
1512
1513     if (get_cpu_var(core_state).have_cpufreq) { 
1514         put_cpu_var(core_state);
1515         linux_restore_defaults();
1516     } else {
1517         put_cpu_var(core_state);
1518     }
1519
1520     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1521
1522     put_cpu_var(core_state);
1523
1524     return 0;
1525 }
1526
1527
1528
1529 void palacios_pstate_ctrl_acquire(uint32_t type)
1530 {
1531     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1532         put_cpu_var(core_state);
1533         palacios_pstate_ctrl_release();
1534     } else {
1535         put_cpu_var(core_state);
1536     }
1537
1538     switch (type) { 
1539         case V3_PSTATE_EXTERNAL_CONTROL:
1540             switch_to_external();
1541             break;
1542         case V3_PSTATE_DIRECT_CONTROL:
1543             switch_to_direct();
1544             break;
1545         case V3_PSTATE_INTERNAL_CONTROL:
1546             switch_to_internal();
1547             break;
1548         default:
1549             ERROR("Unknown pstate control type %u\n",type);
1550             break;
1551     }
1552
1553 }
1554
1555 // Wrappers for xcalls
1556 static void palacios_pstate_ctrl_acquire_external(void)
1557 {
1558     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1559 }
1560
1561 static void palacios_pstate_ctrl_acquire_direct(void)
1562 {
1563     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1564 }
1565
1566
1567 void palacios_pstate_ctrl_release(void)
1568 {
1569     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1570         put_cpu_var(core_state);
1571         return;
1572     } 
1573     put_cpu_var(core_state);
1574
1575     switch (get_cpu_var(core_state).mode) { 
1576         case V3_PSTATE_EXTERNAL_CONTROL:
1577             put_cpu_var(core_state);
1578             switch_from_external();
1579             break;
1580         case V3_PSTATE_DIRECT_CONTROL:
1581             put_cpu_var(core_state);
1582             switch_from_direct();
1583             break;
1584         case V3_PSTATE_INTERNAL_CONTROL:
1585             put_cpu_var(core_state);
1586             switch_from_internal();
1587             break;
1588         default:
1589             put_cpu_var(core_state);
1590             ERROR("Unknown pstate control type %u\n",core_state.mode);
1591             break;
1592     }
1593 }
1594
1595
1596 static void update_hw_pstate(void *arg)
1597 {
1598     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1599         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1600         put_cpu_var(core_state);
1601     } else {
1602         get_cpu_var(core_state).cur_hw_pstate = 0;
1603         put_cpu_var(core_state);
1604     }
1605 }
1606
1607
1608 /***************************************************************************
1609   PROC Interface to expose state
1610  ***************************************************************************/
1611
1612 static int pstate_show(struct seq_file * file, void * v)
1613 {
1614     unsigned int cpu;
1615     unsigned int numcpus = num_online_cpus();
1616
1617     seq_printf(file, "V3VEE DVFS Status\n\n");
1618
1619     for (cpu=0;cpu<numcpus;cpu++) { 
1620         palacios_xcall(cpu,update_hw_pstate,0);
1621     }
1622
1623     for (cpu=0;cpu<numcpus;cpu++) { 
1624         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1625         seq_printf(file,"pcore %u: hw pstate 0x%llx mode %s ",cpu,
1626                 s->cur_hw_pstate,
1627                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1628                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1629                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1630                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1631         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1632             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1633         } 
1634         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1635             seq_printf(file,"(min=%llu max=%llu cur=%llu) ",s->min_pstate, s->max_pstate, s->cur_pstate);
1636         }
1637         seq_printf(file,"\n");
1638     }
1639     return 0;
1640 }
1641
1642 static int pstate_open(struct inode * inode, struct file * file) 
1643 {
1644     return single_open(file, pstate_show, NULL);
1645 }
1646
1647
1648 static struct file_operations pstate_fops = {
1649     .owner = THIS_MODULE,
1650     .open = pstate_open, 
1651     .read = seq_read,
1652     .llseek = seq_lseek,
1653     .release = seq_release
1654 };
1655
1656 static int pstate_hw_show(struct seq_file * file, void * v)
1657 {
1658     int numstates;
1659
1660     seq_printf(file, "V3VEE DVFS Hardware Info\n(all logical cores assumed identical)\n\n");
1661
1662     seq_printf(file, "Arch:   \t%s\n"
1663                      "PStates:\t%s\n\n",
1664             machine_state.arch==INTEL ? "Intel" : 
1665             machine_state.arch==AMD ? "AMD" : "Other",
1666             machine_state.supports_pstates ? "Yes" : "No");
1667
1668
1669 #define YN(x) ((x) ? "Y" : "N")
1670
1671     if (machine_state.arch==INTEL) {
1672         seq_printf(file,"SpeedStep:           \t%s\n",YN(machine_state.have_speedstep));
1673         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1674         seq_printf(file,"IDA or TurboCore:    \t%s\n",YN(machine_state.have_opportunistic));
1675         seq_printf(file,"Policy Hint:         \t%s\n",YN(machine_state.have_policy_hint));
1676         seq_printf(file,"Hardware Policy:     \t%s\n",YN(machine_state.have_hwp));
1677         seq_printf(file,"Hardware Duty Cycle: \t%s\n",YN(machine_state.have_hdc));
1678         seq_printf(file,"MWAIT extensions:    \t%s\n",YN(machine_state.have_mwait_ext));
1679         seq_printf(file,"MWAIT wake on intr:  \t%s\n",YN(machine_state.have_mwait_int));
1680     } 
1681
1682     if (machine_state.arch==AMD) { 
1683         seq_printf(file,"PState:              \t%s\n",YN(machine_state.have_pstate));
1684         seq_printf(file,"APERF/MPERF:         \t%s\n",YN(machine_state.have_pstate_hw_coord));
1685         seq_printf(file,"CoreBoost:           \t%s\n",YN(machine_state.have_coreboost));
1686         seq_printf(file,"Feedback:            \t%s\n",YN(machine_state.have_feedback));
1687     }
1688
1689
1690     seq_printf(file,"\nPstate\tCtrl\tKHz\tmW\tuS(X)\tuS(B)\n");
1691     numstates = get_cpu_var(processors)->performance->state_count;
1692     if (!numstates) { 
1693         seq_printf(file,"UNKNOWN\n");
1694     } else {
1695         int i;
1696         for (i=0;i<numstates;i++) { 
1697             seq_printf(file,
1698                        "%u\t%llx\t%llu\t%llu\t%llu\t%llu\n",
1699                        i, 
1700                        get_cpu_var(processors)->performance->states[i].control,
1701                        get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1702                        get_cpu_var(processors)->performance->states[i].power,
1703                        get_cpu_var(processors)->performance->states[i].transition_latency,
1704                        get_cpu_var(processors)->performance->states[i].bus_master_latency);
1705         }
1706     }
1707     put_cpu_var(processors);
1708
1709     seq_printf(file,"\nAvailable Modes:");
1710     seq_printf(file," host");
1711     if (get_cpu_var(core_state).have_cpufreq) { 
1712         seq_printf(file," external");
1713     }
1714     put_cpu_var(core_state);
1715     if (machine_state.supports_pstates) {
1716         seq_printf(file," direct");
1717     }
1718     seq_printf(file," internal\n");
1719
1720     return 0;
1721 }
1722
1723 static int pstate_hw_open(struct inode * inode, struct file * file) 
1724 {
1725     return single_open(file, pstate_hw_show, NULL);
1726 }
1727
1728
1729 static struct file_operations pstate_hw_fops = {
1730     .owner = THIS_MODULE,
1731     .open = pstate_hw_open, 
1732     .read = seq_read,
1733     .llseek = seq_lseek,
1734     .release = seq_release
1735 };
1736
1737
1738 int pstate_proc_setup(void)
1739 {
1740     struct proc_dir_entry *proc;
1741     struct proc_dir_entry *prochw;
1742
1743     proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1744
1745     if (!proc) { 
1746         ERROR("Failed to create proc entry for p-state control\n");
1747         return -1;
1748     }
1749
1750     proc->proc_fops = &pstate_fops;
1751
1752     INFO("/proc/v3vee/v3-dvfs successfully created\n");
1753
1754     prochw = create_proc_entry("v3-dvfs-hw",0444,palacios_get_procdir());
1755
1756
1757     if (!prochw) { 
1758         ERROR("Failed to create proc entry for p-state hw info\n");
1759         return -1;
1760     }
1761
1762     prochw->proc_fops = &pstate_hw_fops;
1763
1764     INFO("/proc/v3vee/v3-dvfs-hw successfully created\n");
1765
1766     return 0;
1767 }
1768
/*
 * Remove the proc entries created by pstate_proc_setup(), the hardware
 * info entry first.
 */
void pstate_proc_teardown(void)
{
    static const char * const entries[] = { "v3-dvfs-hw", "v3-dvfs" };
    unsigned int idx;

    for (idx = 0; idx < sizeof(entries) / sizeof(entries[0]); idx++) {
        remove_proc_entry(entries[idx],palacios_get_procdir());
    }
}
1774
1775 /********************************************************************
1776   User interface (ioctls)
1777  ********************************************************************/
1778
1779 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1780 {
1781     struct v3_dvfs_ctrl_request r;
1782
1783     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1784         ERROR("Failed to copy DVFS request from user\n");
1785         return -EFAULT;
1786     }
1787
1788     if (r.pcore >= num_online_cpus()) {
1789         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1790         return -EFAULT;
1791     }
1792
1793     switch (r.cmd) {
1794         case V3_DVFS_ACQUIRE: {
1795                                   switch (r.acq_type) { 
1796                                       case V3_DVFS_EXTERNAL:
1797                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1798                                           return 0;
1799                                           break;
1800                                       case V3_DVFS_DIRECT:
1801                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1802                                           return 0;
1803                                           break;
1804                                       default:
1805                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1806                                           return -EFAULT;
1807                                   }
1808                               }
1809                               break;
1810         case V3_DVFS_RELEASE: {
1811                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1812                                   return 0;
1813                               }
1814                               break;
1815         case V3_DVFS_SETFREQ: {
1816                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1817                                   return 0;
1818                               }
1819                               break;
1820         case V3_DVFS_SETPSTATE: {
1821                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1822                                     return 0;
1823                                 }
1824         default: {
1825                      ERROR("Unknown DVFS command %u\n",r.cmd);
1826                      return -EFAULT;
1827                  }
1828                  break;
1829     }
1830 }
1831
1832
1833 void pstate_user_setup(void)
1834 {
1835     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1836 }
1837
1838
1839 void pstate_user_teardown(void)
1840 {
1841     remove_global_ctrl(V3_DVFS_CTRL);
1842 }
1843
1844 static struct v3_host_pstate_ctrl_iface hooks = {
1845     .get_chars = palacios_pstate_ctrl_get_chars,
1846     .acquire = palacios_pstate_ctrl_acquire,
1847     .release = palacios_pstate_ctrl_release,
1848     .set_pstate = palacios_pstate_ctrl_set_pstate,
1849     .get_pstate = palacios_pstate_ctrl_get_pstate,
1850     .set_freq = palacios_pstate_ctrl_set_freq,
1851     .get_freq = palacios_pstate_ctrl_get_freq,
1852 };
1853
1854
1855
1856 static int pstate_ctrl_init(void) 
1857 {
1858     unsigned int cpu;
1859     unsigned int numcpus = num_online_cpus();
1860
1861     pstate_arch_setup();
1862
1863     for (cpu=0;cpu<numcpus;cpu++) { 
1864         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1865     }
1866
1867     V3_Init_Pstate_Ctrl(&hooks);  
1868
1869     if (pstate_proc_setup()) { 
1870         ERROR("Unable to initialize P-State Control\n");
1871         return -1;
1872     }
1873
1874     pstate_user_setup();
1875
1876     pstate_linux_init();
1877
1878     INFO("P-State Control Initialized\n");
1879
1880     return 0;
1881 }
1882
1883 static int pstate_ctrl_deinit(void)
1884 {
1885     unsigned int cpu;
1886     unsigned int numcpus=num_online_cpus();
1887
1888     pstate_linux_deinit();
1889
1890     pstate_user_teardown();
1891
1892     pstate_proc_teardown();
1893
1894     // release pstate control if we have it, and we need to do this on each processor
1895     for (cpu=0;cpu<numcpus;cpu++) { 
1896         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1897     }
1898
1899
1900     // Free any mapping table we built for Intel
1901     if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { 
1902         palacios_free(intel_pstate_to_ctrl);
1903     }
1904
1905
1906     return 0;
1907 }
1908
1909
1910 static struct linux_ext pstate_ext = {
1911     .name = "PSTATE_CTRL",
1912     .init = pstate_ctrl_init,
1913     .deinit = pstate_ctrl_deinit,
1914     .guest_init = NULL,
1915     .guest_deinit = NULL,
1916 };
1917
1918
1919 register_extension(&pstate_ext);
1920
1921
1922