Palacios Public Git Repository

To check out Palacios, execute:

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute:
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


fix pstate set lockup by setting frequency using linux work queues
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/cpufreq.h>
25 #include <linux/kernel.h>
26 #include <linux/kmod.h>
27 #include <linux/module.h>
28 #include <linux/string.h>
29 #include <linux/interrupt.h>
30 #include <asm/processor.h>
31 #include <asm/msr.h>
32 #include <asm/msr-index.h>
33
34 // Used to determine the appropriate pstates values on Intel
35 #include <linux/acpi.h>
36 #include <acpi/processor.h>
37
38 #include <interfaces/vmm_pstate_ctrl.h>
39
40 #include "palacios.h"
41 #include "iface-pstate-ctrl.h"
42
43 #include "linux-exts.h"
44
45 /*
46    This P-STATE control implementation includes:
47
48    - Direct control of Intel and AMD processor pstates
49    - External control of processor states via Linux (unimplemented)
50    - Internal control of processor states in Palacios (handoff from Linux)
51
52    Additionally, it provides a user-space interface for manipulating
53    p-state regardless of the host's functionality.  This includes
54    an ioctl for commanding the implementation and a /proc file for 
55    showing current status and capabilities.
56
57    What we mean by "pstate" here is the processor's internal
58    configuration.   For AMD, this is defined as being the same as
59    the ACPI-defined p-state.  For Intel, it is not.  There, it is the 
60    contents of the perf ctl MSR, which, often, is the frequency id 
61    and voltage id (the multipliers).
62
63 */
64
65
/* Name under which the stub cpufreq governor is registered with Linux */
#define PALACIOS_GOVNAME "v3vee"
/* Size of the buffer that holds the governor-switch shell command */
#define MAX_PATH_LEN     128
/* Size of the buffer that holds a saved governor name */
#define MAX_GOV_NAME_LEN 16
69
70
/*
 * Per-core p-state bookkeeping: the control mode currently in force and
 * the settings that matter under each mode.
 */
struct pstate_core_info {
    // Host control is a notion local to this module
#define V3_PSTATE_HOST_CONTROL 0
    // ... the remaining modes come from the Palacios interface:
    //   V3_PSTATE_EXTERNAL_CONTROL
    //   V3_PSTATE_DIRECT_CONTROL
    //   V3_PSTATE_INTERNAL_CONTROL
    uint32_t mode;

    // Meaningful under DIRECT control
    uint8_t cur_pstate;
    uint8_t max_pstate;
    uint8_t min_pstate;

    uint8_t cur_hw_pstate;

    // Meaningful under EXTERNAL control
    uint64_t cur_freq_khz;
    uint64_t max_freq_khz;
    uint64_t min_freq_khz;

    // Intel only
    uint8_t prior_speedstep;
    uint8_t turbo_disabled;
    uint8_t no_turbo;

    int have_cpufreq;

    // Linux's governor name, stashed here when we make a mode switch
    char * linux_governor;
    // Kept so the frequency we started with can be restored
    uint64_t original_hz; 

};
105
106
107 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
108
109
110
/*
 * Architecture-specific hooks used to assert DIRECT control over the
 * core pstates.  One table exists per supported vendor.
 */
struct pstate_core_funcs {
    void    (*arch_init)(void);             // per-core setup
    void    (*arch_deinit)(void);           // per-core teardown
    uint64_t (*get_min_pstate)(void);
    uint64_t (*get_max_pstate)(void);
    uint64_t (*get_pstate)(void);           // read the current pstate
    void    (*set_pstate)(uint64_t pstate); // request a pstate
};
120
/*
 * Machine-wide description of the processor vendor and of the
 * power-management features discovered through CPUID.
 */
struct pstate_machine_info {
    enum {INTEL, AMD, OTHER } arch;
    int supports_pstates;


    // AMD features
    int have_pstate;
    int have_coreboost;
    int have_feedback;  

    // Intel features
    int have_speedstep;
    int have_opportunistic; // "Turbo Boost" or "IDA"
    int have_policy_hint;
    int have_hwp;       // hardware-controlled performance states
    int have_hdc;       // hardware duty cycling
    int have_mwait_ext; // mwait power extensions
    int have_mwait_int; // mwait wakes on interrupt

    // Common to both vendors
    int have_pstate_hw_coord;  // mperf/aperf

    // Hooks used for DIRECT control
    struct pstate_core_funcs *funcs;

};
147
148 static struct pstate_machine_info machine_state;
149
150
151 /****************************************************
152   AMD  DIRECT CONTROL
153  ***************************************************/
154
/*
 * AMD P-state MSRs.
 * See AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557
 */
#define MSR_PSTATE_LIMIT_REG_AMD 0xc0010061
#define MSR_PSTATE_CTL_REG_AMD   0xc0010062
#define MSR_PSTATE_STAT_REG_AMD  0xc0010063
159
/* Bit-field view of the AMD P-state limit MSR (MSR_PSTATE_LIMIT_REG_AMD) */
struct p_state_limit_reg_amd {
    union {
        uint64_t val;
        struct {
            /* lowest P-state value (highest perf.) currently supported;
               this can change at runtime */
            uint8_t  pstate_limit : 4;
            /* highest P-state value supported (lowest perf) */
            uint8_t  pstate_max   : 4;
            uint64_t rsvd         : 56;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
170
171
/* Bit-field view of the AMD P-state status MSR (MSR_PSTATE_STAT_REG_AMD) */
struct p_state_stat_reg_amd {
    union {
        uint64_t val;
        struct {
            uint8_t  pstate  : 4;   /* low four bits of the register */
            uint64_t rsvd    : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
181
182
/* Bit-field view of the AMD P-state control MSR (MSR_PSTATE_CTL_REG_AMD) */
struct p_state_ctl_reg_amd {
    union {
        uint64_t val;
        struct {
            uint8_t  cmd  : 4;      /* low four bits of the register */
            uint64_t rsvd : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
192
193
194 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
195 static uint8_t supports_pstates_amd (void)
196 {
197     uint32_t eax, ebx, ecx, edx;
198
199     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
200     machine_state.have_pstate = !!(edx & (1 << 7));
201     machine_state.have_coreboost = !!(edx & (1<<9));
202     machine_state.have_feedback = !!(edx & (1<<11));
203
204     cpuid(0x6, &eax, &ebx, &ecx, &edx);
205     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
206
207     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
208             machine_state.have_pstate, 
209             machine_state.have_coreboost, 
210             machine_state.have_feedback,
211             machine_state.have_pstate_hw_coord);
212
213     return machine_state.have_pstate;
214
215
216 }
217
218
/* Per-core DIRECT-control setup hook for AMD: intentionally empty (KCH) */
static void init_arch_amd(void)
{
}
223
224
/* Per-core DIRECT-control teardown hook for AMD: intentionally empty (KCH) */
static void deinit_arch_amd(void)
{
}
229
230
231 static uint64_t get_pstate_amd(void) 
232 {
233     struct p_state_stat_reg_amd pstat;
234
235     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
236
237     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
238     put_cpu_var(core_state);
239
240     return pstat.reg.pstate;
241 }
242
243
244 static void set_pstate_amd(uint64_t p)
245 {
246     struct p_state_ctl_reg_amd pctl;
247     pctl.val = 0;
248     pctl.reg.cmd = p;
249
250     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
251
252     get_cpu_var(core_state).cur_pstate=p;
253     put_cpu_var(core_state);
254 }
255
256
257 /*
258  * NOTE: HW may change this value at runtime
259  */
260 static uint64_t get_max_pstate_amd(void)
261 {
262     struct p_state_limit_reg_amd plimits;
263
264     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
265
266     return plimits.reg.pstate_max;
267 }
268
269
270 static uint64_t get_min_pstate_amd(void)
271 {
272     struct p_state_limit_reg_amd plimits;
273
274     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
275
276     return plimits.reg.pstate_limit;
277 }
278
279
280 static struct pstate_core_funcs amd_funcs =
281 {
282     .arch_init        = init_arch_amd,
283     .arch_deinit      = deinit_arch_amd,
284     .get_pstate       = get_pstate_amd,
285     .set_pstate       = set_pstate_amd,
286     .get_max_pstate   = get_max_pstate_amd,
287     .get_min_pstate   = get_min_pstate_amd,
288 };
289
290
291
292 /***********************************************************
293   INTEL DIRECT CONTROL
294  **********************************************************/
295
296
297 /*
298    This implementation uses SpeedStep, but does check
299    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
300    are available.
301    */
302
/*
 * Intel MSR numbers used by this file.
 * See Intel System Programmer's Manual Vol. 3B, 14-2
 */
#define MSR_MPERF_IA32         0x000000e7
#define MSR_APERF_IA32         0x000000e8
#define MSR_MISC_ENABLE_IA32   0x000001a0
#define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
#define MSR_PLATFORM_INFO_IA32 0x000000ce
#define MSR_PERF_CTL_IA32      0x00000199
#define MSR_PERF_STAT_IA32     0x00000198
#define MSR_ENERY_PERF_BIAS_IA32 0x000001b0
312
313
/* Unlike AMD, the meaning of the pstate value in the Intel control
   and status registers is implementation dependent.  The "official"
   way to obtain the mapping from pstate to these values is ACPI.
   What is written in the register is an "id" of an operating point.

   "Often", the 16 bit field consists of a high order byte
   which is the frequency (the multiplier) and a low order
   byte which is the voltage.
   */
// MSR_PERF_CTL_IA32  r/w
struct perf_ctl_reg_intel {
    union {
        uint64_t val;
        struct {
            // The target operating point.
            // This is not the ACPI pstate: Intel treats it as opaque.
            // On lots of implementations it seems to be
            // frequency_id : voltage_id
            // where frequency_id is typically the multiplier
            uint16_t pstate                 : 16;
            uint16_t reserved               : 16;
            // write 1 to *disengage* dynamic acceleration
            // ("IDA" and "Turbo" share this interface)
            uint16_t dynamic_accel_disable  : 1;
            uint32_t reserved2              : 31;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
345
// MSR_PERF_STAT_IA32 (read-only)
struct perf_stat_reg_intel {
    union {
        uint64_t val;
        struct {
            // the current operating point
            uint16_t pstate                 : 16;
            uint64_t reserved               : 48;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
357
// MSR_ENERGY_PERF_BIAS_IA32 (read/write)
struct enery_perf_bias_reg_intel {
    union {
        uint64_t val;
        struct {
            // low four bits of the register
            uint8_t  policy_hint            : 4;
            uint64_t reserved               : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
369
// MSR_PLATFORM_INFO
//
// Bit positions within the 64-bit register value:
//    8..15  max_noturbo_ratio
//   23      ppin_cap
//   28      ratio_limit
//   29      tdc_tdp_limit
//   40..47  min_ratio
//
// The inner bit-field struct must itself be packed.  The packed attribute
// on the enclosing union does not reach into it, and without it rsvd3
// (a 10-bit uint16_t field starting at bit 30) cannot straddle its
// storage unit, so it and every later field would be pushed to higher
// bit positions and the struct would grow beyond 64 bits.
struct turbo_mode_info_reg_intel {
    union {
        uint64_t val;
        struct {
            uint8_t  rsvd0                  : 8;
            uint8_t  max_noturbo_ratio      : 8;
            uint8_t  rsvd1                  : 7;
            uint8_t  ppin_cap               : 1;
            uint8_t  rsvd2                  : 4;
            uint8_t  ratio_limit            : 1; 
            uint8_t  tdc_tdp_limit          : 1;
            uint16_t rsvd3                  : 10;
            uint8_t  min_ratio              : 8;
            uint16_t rsvd4                  : 16;
        } __attribute__((packed)) reg;
    } __attribute__((packed));
} __attribute__((packed));
388
// Carries the critical information from Linux's struct acpi_processor_px
// in an OS-neutral form, to make porting to other OSes easier.
struct intel_pstate_info {
    uint64_t freq;  // frequency in KHz
    uint64_t ctrl;  // value to write into the _CTL MSR to select it
};
395
// Fallback table, used when the mapping cannot be built locally
static struct intel_pstate_info *intel_pstate_to_ctrl_internal = 0;
static int intel_num_pstates_internal = 0;

// Active table: refers either to the fallback table or to a constructed one
static struct intel_pstate_info *intel_pstate_to_ctrl = 0;
static int intel_num_pstates = 0;
403
404
405 /* CPUID.01:ECX.AES(7) */
406 static uint8_t supports_pstates_intel(void)
407 {
408     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
409     */
410     uint32_t eax, ebx, ecx, edx;
411
412     cpuid(0x1, &eax, &ebx, &ecx, &edx);
413     machine_state.have_speedstep =  !!(ecx & (1 << 7));
414
415     cpuid(0x6, &eax, &ebx, &ecx, &edx);
416     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
417     machine_state.have_opportunistic =  !!(eax & 1<<1);
418     machine_state.have_policy_hint = !!(ecx & 1<<3);
419     machine_state.have_hwp = !!(eax & 1<<7);
420     machine_state.have_hdc = !!(eax & 1<<13);
421
422     cpuid(0x5, &eax, &ebx, &ecx, &edx);
423     machine_state.have_mwait_ext =  !!(ecx & 1);
424     machine_state.have_mwait_int =  !!(ecx & 1<<1);
425
426
427     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
428             machine_state.have_speedstep, 
429             machine_state.have_pstate_hw_coord, 
430             machine_state.have_opportunistic,
431             machine_state.have_policy_hint,
432             machine_state.have_hwp,
433             machine_state.have_hdc,
434             machine_state.have_mwait_ext,
435             machine_state.have_mwait_int );
436
437
438     if (machine_state.have_speedstep) {
439         uint32_t i;
440         // Build mapping table (from "pstate" (0..) to ctrl value for MSR
441         if (!(get_cpu_var(processors)) || !(get_cpu_var(processors)->performance) ) { 
442             put_cpu_var(processors);
443             // no acpi...  revert to internal table
444             intel_pstate_to_ctrl=intel_pstate_to_ctrl_internal;
445             intel_num_pstates=intel_num_pstates_internal;
446         } else {
447             intel_num_pstates = get_cpu_var(processors)->performance->state_count;
448             if (intel_num_pstates) { 
449                 intel_pstate_to_ctrl = palacios_alloc(sizeof(struct intel_pstate_info)*intel_num_pstates);
450                 if (!intel_pstate_to_ctrl) { 
451                     ERROR("P-State: Cannot allocate space for mapping...\n");
452                     intel_num_pstates=0;
453                 }
454                 for (i=0;i<intel_num_pstates;i++) { 
455                     intel_pstate_to_ctrl[i].freq = get_cpu_var(processors)->performance->states[i].core_frequency*1000;
456                     intel_pstate_to_ctrl[i].ctrl = get_cpu_var(processors)->performance->states[i].control;
457                 }
458                     
459             } else {
460                 ERROR("P-State: Strange, machine has ACPI DVFS but no states...\n");
461             }
462         }
463         put_cpu_var(processors);
464         INFO("P-State: Intel - State Mapping (%u states) follows\n",intel_num_pstates);
465         for (i=0;i<intel_num_pstates;i++) {
466             INFO("P-State: Intel Mapping %u:  freq=%llu  ctrl=%llx\n",
467                  i, intel_pstate_to_ctrl[i].freq,intel_pstate_to_ctrl[i].ctrl);
468         }
469     } else {
470         INFO("P-State: Intel:  No speedstep here\n");
471     }
472         
473
474     return machine_state.have_speedstep;
475 }
476
477
478 static void init_arch_intel(void)
479 {
480     uint64_t val;
481
482     rdmsrl(MSR_MISC_ENABLE_IA32, val);
483
484     //INFO("P-State: prior ENABLE=%llx\n",val);
485
486     // store prior speedstep setting
487     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
488     put_cpu_var(core_state);
489
490     // enable speedstep (probably already on)
491     val |= 1 << 16;
492     wrmsrl(MSR_MISC_ENABLE_IA32, val);
493
494     //INFO("P-State: write ENABLE=%llx\n",val);
495
496 }
497
498 static void deinit_arch_intel(void)
499 {
500     uint64_t val;
501
502     rdmsrl(MSR_MISC_ENABLE_IA32, val);
503
504     //INFO("P-State: deinit: ENABLE=%llx\n",val);
505
506     val &= ~(1ULL << 16);
507     val |= get_cpu_var(core_state).prior_speedstep << 16;
508     put_cpu_var(core_state);
509
510     wrmsrl(MSR_MISC_ENABLE_IA32, val);
511
512     //INFO("P-state: deinit ENABLE=%llx\n",val);
513
514 }
515
516 /* TODO: Intel P-states require sampling at intervals... */
517 static uint64_t get_pstate_intel(void)
518 {
519     uint64_t val;
520
521     rdmsrl(MSR_PERF_STAT_IA32,val);
522
523     //INFO("P-State: Get: 0x%llx\n", val);
524
525     // should check if turbo is active, in which case 
526     // this value is not the whole story
527
528     return val;
529 }
530
531 static void set_pstate_intel(uint64_t p)
532 {
533     uint64_t val;
534     uint64_t ctrl;
535
536     if (intel_num_pstates==0) { 
537         return ;
538     } else {
539         if (p>=intel_num_pstates) { 
540             p=intel_num_pstates-1;
541         }
542     }
543
544     ctrl=intel_pstate_to_ctrl[p].ctrl;
545
546     /* ...Intel IDA (dynamic acceleration)
547        if (c->no_turbo && !c->turbo_disabled) {
548        val |= 1 << 32;
549        }
550        */
551     // leave all bits along expect for the likely
552     // fid bits
553
554     rdmsrl(MSR_PERF_CTL_IA32, val);
555     INFO("P-State: Pre-Set: 0x%llx\n", val);
556
557     val &= ~0xffffULL;
558     val |= ctrl & 0xffffULL;
559
560     INFO("P-State: Set: 0x%llx\n", val);
561
562     wrmsrl(MSR_PERF_CTL_IA32, val);
563
564     get_cpu_var(core_state).cur_pstate = p;
565     put_cpu_var(core_state);
566 }
567
568
/* The lowest Intel pstate index is always the first table entry, 0 */
static uint64_t get_min_pstate_intel(void)
{
    const uint64_t first_index = 0;

    return first_index;
}
573
574
575
576 static uint64_t get_max_pstate_intel (void)
577 {
578     if (intel_num_pstates==0) { 
579         return 0;
580     } else {
581         return intel_num_pstates-1;
582     }
583 }
584
585 static struct pstate_core_funcs intel_funcs =
586 {
587     .arch_init        = init_arch_intel,
588     .arch_deinit      = deinit_arch_intel,
589     .get_pstate       = get_pstate_intel,
590     .set_pstate       = set_pstate_intel,
591     .get_max_pstate   = get_max_pstate_intel,
592     .get_min_pstate   = get_min_pstate_intel,
593 };
594
595
596
597 /***********************************************
598   Arch determination and setup
599  ***********************************************/
600
/*
 * Execute CPUID with EAX=id and store the resulting EAX, EBX, ECX and EDX
 * in dest[0..3], in that order.
 */
static inline void cpuid_string (uint32_t id, uint32_t dest[4]) 
{
    __asm__ __volatile__("cpuid"
            : "=a"(dest[0]), "=b"(dest[1]), "=c"(dest[2]), "=d"(dest[3])
            : "a"(id));
}
607
608
/*
 * Fetch the 12-character CPU vendor string from CPUID leaf 0.
 *
 * @name: caller's buffer of at least 13 bytes; receives the vendor string
 *        followed by a NUL terminator
 *
 * Returns the EAX value of leaf 0.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t regs[4];

    cpuid_string(0, regs);

    // The vendor string is laid out as EBX, EDX, ECX.  Copy bytewise
    // rather than storing through a uint32_t* cast of the char buffer,
    // which makes no alignment or aliasing assumptions about 'name'.
    memcpy(name,     &regs[1], sizeof(regs[1]));
    memcpy(name + 4, &regs[3], sizeof(regs[3]));
    memcpy(name + 8, &regs[2], sizeof(regs[2]));
    name[12] = 0;

    return regs[0];
}
623
624
/* Non-zero iff the CPU vendor string is "GenuineIntel" */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "GenuineIntel") == 0;
}
631
632
/* Non-zero iff the CPU vendor string is "AuthenticAMD" */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);

    return strcmp(vendor, "AuthenticAMD") == 0;
}
639
640 static int pstate_arch_setup(void)
641 {
642
643     if (is_amd()) {
644         machine_state.arch = AMD;
645         machine_state.funcs = &amd_funcs;
646         machine_state.supports_pstates = supports_pstates_amd();
647         INFO("PSTATE: P-State initialized for AMD\n");
648     } else if (is_intel()) {
649         machine_state.arch  = INTEL;
650         machine_state.funcs = &intel_funcs;
651         machine_state.supports_pstates = supports_pstates_intel();
652         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
653         return 0;
654
655     } else {
656         machine_state.arch = OTHER;
657         machine_state.funcs = NULL;
658         machine_state.supports_pstates = 0;
659         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
660         return 0;
661     }
662
663     return 0;
664 }
665
666
667
668 /******************************************************************
669   Linux Interface
670  *****************************************************************/
671
672
673
674 /* 
675  * This stub governor is simply a placeholder for preventing 
676  * frequency changes from the Linux side. For now, we simply leave
677  * the frequency as is when we acquire control. 
678  */
679 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
680 {
681
682     switch (event) {
683         /* we can't use cpufreq_driver_target here as it can result
684          * in a circular dependency, so we'll just do nothing.
685          */
686         case CPUFREQ_GOV_START:
687         case CPUFREQ_GOV_STOP:
688         case CPUFREQ_GOV_LIMITS:
689             /* do nothing */
690             break;
691         default:
692             ERROR("Undefined governor command\n");
693             return -1;
694     }                           
695
696     return 0;
697 }
698
699
700 static struct cpufreq_governor stub_governor = 
701 {
702     .name = PALACIOS_GOVNAME,
703     .governor = governor_run,
704     .owner = THIS_MODULE,
705 };
706
707
708 static struct workqueue_struct *pstate_wq;
709
710
711 typedef struct {
712     struct work_struct work;
713     uint64_t freq;
714 } pstate_work_t;
715
716 static inline void pstate_register_linux_governor(void)
717 {
718     cpufreq_register_governor(&stub_governor);
719 }
720
721
722 static inline void pstate_unregister_linux_governor(void)
723 {
724     cpufreq_unregister_governor(&stub_governor);
725 }
726
727
728 static int pstate_linux_init(void)
729 {
730     pstate_register_linux_governor();
731     pstate_wq = create_workqueue("v3vee_pstate_wq");
732     if (!pstate_wq) {
733         ERROR("Could not create work queue\n");
734         goto out_err;
735     }
736
737     return 0;
738
739 out_err:
740     pstate_unregister_linux_governor();
741     return -1;
742 }
743
744
745 static void pstate_linux_deinit(void)
746 {
747     pstate_unregister_linux_governor();
748     flush_workqueue(pstate_wq);
749     destroy_workqueue(pstate_wq);
750 }
751
752
753 static int get_current_governor(char **buf, unsigned int cpu)
754 {
755     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
756     char * govname = NULL;
757
758     if (!policy) {
759         ERROR("could not allocate cpufreq_policy\n");
760         return -1;
761     }
762         
763     if (cpufreq_get_policy(policy, cpu) != 0) {
764         ERROR("Could not get current cpufreq policy\n");
765         goto out_err;
766     }
767
768     /* We're in interrupt context, should probably not wait here */
769     govname = palacios_alloc(MAX_GOV_NAME_LEN);
770     if (!govname) {
771         ERROR("Could not allocate space for governor name\n");
772         goto out_err;
773     }
774
775     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
776
777     get_cpu_var(core_state).linux_governor = govname;
778     put_cpu_var(core_state);
779
780     *buf = govname;
781
782     palacios_free(policy);
783
784     return 0;
785
786 out_err:
787     palacios_free(policy);
788     return -1;
789 }
790
791
792 /* passed to the userspacehelper interface for cleanup */
793 static void gov_switch_cleanup(struct subprocess_info * s)
794 {
795     palacios_free(s->argv[2]);
796     palacios_free(s->argv);
797 }
798
799
800 /* 
801  * Switch governors
802  * @s - the governor to switch to 
803  * TODO: this should probably be submitted to a work queue
804  * so we don't have to run it in interrupt context
805  */
806 static int governor_switch(char * s, unsigned int cpu)
807 {
808     char * path_str = NULL;
809     char ** argv = NULL; 
810
811     static char * envp[] = {
812         "HOME=/",
813         "TERM=linux",
814         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
815
816
817     argv = palacios_alloc(4*sizeof(char*));
818     if (!argv) {
819         ERROR("Couldn't allocate argv struct\n");
820         return -1;
821     }
822
823     path_str = palacios_alloc(MAX_PATH_LEN);
824     if (!path_str) {
825         ERROR("Couldn't allocate path string\n");
826         goto out_freeargv;
827     }
828     memset(path_str, 0, MAX_PATH_LEN);
829
830     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
831
832     argv[0] = "/bin/sh";
833     argv[1] = "-c";
834     argv[2] = path_str;
835     argv[3] = NULL;
836
837     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
838     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
839
840 out_freeargv:
841     palacios_free(argv);
842     return -1;
843 }
844
845
846 static inline void free_linux_governor(void)
847 {
848     palacios_free(get_cpu_var(core_state).linux_governor);
849     put_cpu_var(core_state);
850 }
851
852
853 static int linux_setup_palacios_governor(void)
854 {
855     char * gov;
856     unsigned int cpu = get_cpu();
857
858     /* KCH:  we assume the v3vee governor is already 
859      * registered with kernel by this point 
860      */
861
862     if (get_current_governor(&gov, cpu) < 0) {
863         ERROR("Could not get current governor\n");
864         return -1;
865     }
866
867     DEBUG("saving current governor (%s)\n", gov);
868
869     get_cpu_var(core_state).linux_governor = gov;
870     put_cpu_var(core_state);
871     
872     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
873
874     /* set the new one to ours */
875     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
876         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
877         return -1;
878     }
879
880     return 0;
881 }
882
883
884
885 static int linux_get_pstate(void)
886 {
887     struct cpufreq_policy * policy = NULL;
888     struct cpufreq_frequency_table *table;
889     int cpu = get_cpu(); 
890     unsigned int i = 0;
891     unsigned int count = 0;
892
893
894     policy = palacios_alloc(sizeof(struct cpufreq_policy));
895     if (!policy) {
896         ERROR("Could not allocate policy struct\n");
897         return -1;
898     }
899
900     cpufreq_get_policy(policy, cpu);
901     table = cpufreq_frequency_get_table(cpu);
902
903     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
904
905         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
906             continue;
907         }
908
909         if (table[i].frequency == policy->cur) {
910             break;
911         }
912
913         count++;
914     }
915
916     palacios_free(policy);
917
918     put_cpu();
919     return count;
920 }
921
922
923 static int linux_get_freq(void)
924 {
925     struct cpufreq_policy * policy = NULL;
926     int cpu = get_cpu();
927
928     policy = palacios_alloc(sizeof(struct cpufreq_policy));
929     if (!policy) {
930         ERROR("Could not allocate policy struct\n");
931         return -1;
932     }
933
934     if (cpufreq_get_policy(policy, cpu)) {
935         ERROR("Could not get current policy\n");
936         return -1;
937     }
938
939     return policy->cur;
940 }
941
942 static void  
943 pstate_switch_workfn (struct work_struct *work)
944 {
945     pstate_work_t * pwork = (pstate_work_t*)work;
946     struct cpufreq_policy * policy = NULL;
947     int cpu = get_cpu();
948     put_cpu();
949
950     policy = palacios_alloc(sizeof(struct cpufreq_policy));
951     if (!policy) {
952         ERROR("Could not allocate space for cpufreq policy\n");
953         goto out;
954     }
955
956     if (cpufreq_get_policy(policy, cpu) != 0) {
957         ERROR("Could not get cpufreq policy\n");
958         goto out1;
959     }
960
961     INFO("P-state: setting frequency on core %u to %llu\n", cpu, pwork->freq);
962     cpufreq_driver_target(policy, pwork->freq, CPUFREQ_RELATION_H);
963
964     get_cpu_var(core_state).cur_freq_khz = pwork->freq;
965     put_cpu_var(core_state);
966
967 out1:
968     palacios_free(policy);
969 out:
970     palacios_free(work);
971
972
973
974 static int linux_set_pstate(uint8_t p)
975 {
976     struct cpufreq_policy * policy = NULL;
977     struct cpufreq_frequency_table *table;
978     pstate_work_t * work = NULL;
979     int cpu = get_cpu();
980     unsigned int i = 0;
981     unsigned int count = 0;
982     int state_set = 0;
983     int last_valid = 0;
984     put_cpu();
985
986     policy = palacios_alloc(sizeof(struct cpufreq_policy));
987     if (!policy) {
988         ERROR("Could not allocate policy struct\n");
989         return -1;
990     }
991
992     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
993     if (!work) {
994         ERROR("Could not allocate work struct\n");
995         goto out_err;
996     }
997
998     if (cpufreq_get_policy(policy, cpu)) {
999         ERROR("Could not get current policy\n");
1000         goto out_err1;
1001     }
1002     table = cpufreq_frequency_get_table(cpu);
1003
1004     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
1005
1006         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
1007             continue;
1008         }
1009
1010         if (count == p) {
1011
1012             INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1013             work->freq = table[i].frequency;
1014             queue_work(pstate_wq, (struct work_struct*)work);
1015
1016             state_set = 1;
1017             break;
1018         }
1019
1020         count++;
1021         last_valid = i;
1022     }
1023
1024     /* we need to deal with the case in which we get a number > max pstate */
1025     if (!state_set) {
1026         INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1027         work->freq = table[last_valid].frequency;
1028         queue_work(pstate_wq, (struct work_struct*)work);
1029     }
1030
1031     palacios_free(policy);
1032     return 0;
1033
1034 out_err1: 
1035     palacios_free(work);
1036 out_err:
1037     palacios_free(policy);
1038     return -1;
1039 }
1040
1041
1042 static int linux_set_freq(uint64_t f)
1043 {
1044     struct cpufreq_policy * policy = NULL;
1045     pstate_work_t * work = NULL;
1046     uint64_t freq;
1047     int cpu = get_cpu();
1048     put_cpu();
1049
1050     policy = palacios_alloc(sizeof(struct cpufreq_policy));
1051     if (!policy) {
1052         ERROR("Could not allocate policy struct\n");
1053         return -1;
1054     }
1055
1056     work = (pstate_work_t*)palacios_alloc(sizeof(pstate_work_t));
1057     if (!work) {
1058         ERROR("Could not allocate work struct\n");
1059         goto out_err;
1060     }
1061
1062     if (cpufreq_get_policy(policy, cpu) != 0) {
1063         ERROR("Could not get cpufreq policy\n");
1064         goto out_err1;
1065     }
1066
1067     if (f < policy->min) {
1068         freq = policy->min;
1069     } else if (f > policy->max) {
1070         freq = policy->max;
1071     } else {
1072         freq = f;
1073     }
1074
1075     INIT_WORK((struct work_struct*)work, pstate_switch_workfn);
1076     work->freq = freq;
1077     queue_work(pstate_wq, (struct work_struct*)work);
1078
1079     palacios_free(policy);
1080     return 0;
1081
1082 out_err1:
1083     palacios_free(work);
1084 out_err:
1085     palacios_free(policy);
1086     return -1;
1087 }
1088
1089
1090 static int linux_restore_defaults(void)
1091 {
1092     unsigned int cpu = get_cpu();
1093     char * gov = NULL;
1094
1095     gov = get_cpu_var(core_state).linux_governor;
1096     put_cpu_var(core_state);
1097
1098     DEBUG("restoring previous governor (%s)\n", gov);
1099
1100     if (governor_switch(gov, cpu) < 0) {
1101         ERROR("Could not restore governor to (%s)\n", gov);
1102         goto out_err;
1103     }
1104
1105     free_linux_governor();
1106     return 0;
1107
1108 out_err:
1109     free_linux_governor();
1110     return -1;
1111 }
1112
1113
1114
1115 /******************************************************************
1116   Generic Interface as provided to Palacios and to the rest of the
1117   module
1118  ******************************************************************/
1119
1120 static void init_core(void)
1121 {
1122     unsigned cpu;
1123     struct cpufreq_policy *p;
1124     unsigned int i;
1125
1126
1127     DEBUG("P-State Core Init\n");
1128
1129     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1130     get_cpu_var(core_state).cur_pstate = 0;
1131
1132     if (machine_state.funcs) {
1133         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
1134         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
1135     } else {
1136         get_cpu_var(core_state).min_pstate = 0;
1137         get_cpu_var(core_state).max_pstate = 0;
1138     }
1139
1140
1141     cpu = get_cpu(); put_cpu();
1142
1143     p = cpufreq_cpu_get(cpu);
1144
1145     if (!p) { 
1146         get_cpu_var(core_state).have_cpufreq = 0;
1147         get_cpu_var(core_state).min_freq_khz=0;
1148         get_cpu_var(core_state).max_freq_khz=0;
1149         get_cpu_var(core_state).cur_freq_khz=0;
1150     } else {
1151         get_cpu_var(core_state).have_cpufreq = 1;
1152         get_cpu_var(core_state).min_freq_khz=p->min;
1153         get_cpu_var(core_state).max_freq_khz=p->max;
1154         get_cpu_var(core_state).cur_freq_khz=p->cur; } cpufreq_cpu_put(p); 
1155     put_cpu_var(core_state);
1156
1157     for (i=0;i<get_cpu_var(processors)->performance->state_count; i++) { 
1158         INFO("P-State: %u: freq=%llu ctrl=%llx",
1159                 i, 
1160                 get_cpu_var(processors)->performance->states[i].core_frequency*1000,
1161                 get_cpu_var(processors)->performance->states[i].control);
1162    }
1163    put_cpu_var(processors);
1164 }
1165
1166
// defined further down; needed here by deinit_core
void palacios_pstate_ctrl_release(void);


/*
 * Per-core teardown: give up whatever pstate control this core holds.
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    palacios_pstate_ctrl_release();
}
1176
1177
1178
1179 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1180 {
1181     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1182
1183
1184     c->features = V3_PSTATE_INTERNAL_CONTROL;
1185
1186     if (get_cpu_var(core_state).have_cpufreq) {
1187         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1188     }
1189
1190     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1191         c->features |= V3_PSTATE_DIRECT_CONTROL;
1192     }
1193     c->cur_mode = get_cpu_var(core_state).mode;
1194     c->min_pstate = get_cpu_var(core_state).min_pstate;
1195     c->max_pstate = get_cpu_var(core_state).max_pstate;
1196     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1197     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1198     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1199     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1200
1201     put_cpu_var(core_state);
1202
1203
1204
1205 }
1206
1207
1208 uint64_t palacios_pstate_ctrl_get_pstate(void)
1209 {
1210     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1211         put_cpu_var(core_state);
1212         return machine_state.funcs->get_pstate();
1213     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1214         put_cpu_var(core_state);
1215         return linux_get_pstate();
1216     } else {
1217         put_cpu_var(core_state);
1218         return 0;
1219     }
1220 }
1221
1222
1223 void palacios_pstate_ctrl_set_pstate(uint64_t p)
1224 {
1225     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1226         put_cpu_var(core_state);
1227         machine_state.funcs->set_pstate(p);
1228     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1229         put_cpu_var(core_state);
1230         linux_set_pstate(p);
1231     } else {
1232         put_cpu_var(core_state);
1233     }
1234 }
1235
1236
1237 void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
1238 {
1239     palacios_pstate_ctrl_set_pstate((uint8_t)(uint64_t)p);
1240 }
1241
1242
1243 uint64_t palacios_pstate_ctrl_get_freq(void)
1244 {
1245     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1246         put_cpu_var(core_state);
1247         return linux_get_freq();
1248     } else {
1249         put_cpu_var(core_state);
1250         return 0;
1251     }
1252 }
1253
1254
1255 void palacios_pstate_ctrl_set_freq(uint64_t p)
1256 {
1257     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1258         put_cpu_var(core_state);
1259         linux_set_freq(p);
1260     } else {
1261         put_cpu_var(core_state);
1262     }
1263 }
1264
1265
1266 static int switch_to_external(void)
1267 {
1268     DEBUG("switch from host control to external\n");
1269
1270     if (!(get_cpu_var(core_state).have_cpufreq)) {
1271         put_cpu_var(core_state);
1272         ERROR("No cpufreq  - cannot switch to external...\n");
1273         return -1;
1274     } 
1275     put_cpu_var(core_state);
1276
1277     linux_setup_palacios_governor();
1278
1279     get_cpu_var(core_state).mode=V3_PSTATE_EXTERNAL_CONTROL;
1280     put_cpu_var(core_state);
1281
1282     return 0;
1283 }
1284
1285
1286 static int switch_to_direct(void)
1287 {
1288     DEBUG("switch from host control to direct\n");
1289
1290     if (get_cpu_var(core_state).have_cpufreq) { 
1291         put_cpu_var(core_state);
1292         DEBUG("switch to direct from cpufreq\n");
1293
1294         // The implementation would set the policy and governor to peg cpu
1295         // regardless of load
1296         linux_setup_palacios_governor();
1297     } else {
1298         put_cpu_var(core_state);
1299     }
1300
1301     if (machine_state.funcs && machine_state.funcs->arch_init) {
1302         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1303
1304         machine_state.funcs->arch_init();
1305
1306         put_cpu_var(core_state);
1307     }
1308
1309     return 0;
1310 }
1311
1312
1313 static int switch_to_internal(void)
1314 {
1315     DEBUG("switch from host control to internal\n");
1316
1317     if (get_cpu_var(core_state).have_cpufreq) { 
1318         put_cpu_var(core_state);
1319         DEBUG("switch to internal on machine with cpu freq\n");
1320         linux_setup_palacios_governor();
1321     } else {
1322         put_cpu_var(core_state);
1323     }
1324
1325     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1326
1327     put_cpu_var(core_state);
1328
1329     return 0;
1330 }
1331
1332
1333 static int switch_from_external(void)
1334 {
1335     if (!(get_cpu_var(core_state).have_cpufreq)) {
1336         put_cpu_var(core_state);
1337         ERROR("No cpufreq  - how did we get here... external...\n");
1338         return -1;
1339     }
1340     put_cpu_var(core_state);
1341
1342     DEBUG("Switching back to host control from external\n");
1343
1344     if (get_cpu_var(core_state).have_cpufreq) { 
1345         put_cpu_var(core_state);
1346         linux_restore_defaults();
1347     } else {
1348         put_cpu_var(core_state);
1349     }
1350
1351     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1352     put_cpu_var(core_state);
1353
1354     return 0;
1355 }
1356
1357
1358 static int switch_from_direct(void)
1359 {
1360
1361     DEBUG("Switching back to host control from direct\n");
1362
1363     // Set maximum performance, just in case there is no host control
1364     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1365     machine_state.funcs->arch_deinit();
1366
1367     if (get_cpu_var(core_state).have_cpufreq) { 
1368         put_cpu_var(core_state);
1369         linux_restore_defaults();
1370     } else {
1371         put_cpu_var(core_state);
1372     }
1373
1374     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1375
1376     put_cpu_var(core_state);
1377
1378     return 0;
1379 }
1380
1381
1382 static int switch_from_internal(void)
1383 {
1384     DEBUG("Switching back to host control from internal\n");
1385
1386     if (get_cpu_var(core_state).have_cpufreq) { 
1387         put_cpu_var(core_state);
1388         // ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1389         // The implementation would switch back to default policy and governor
1390         linux_restore_defaults();
1391     } else {
1392         put_cpu_var(core_state);
1393     }
1394
1395     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1396
1397     put_cpu_var(core_state);
1398
1399     return 0;
1400 }
1401
1402
1403
1404 void palacios_pstate_ctrl_acquire(uint32_t type)
1405 {
1406     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1407         put_cpu_var(core_state);
1408         palacios_pstate_ctrl_release();
1409     } else {
1410         put_cpu_var(core_state);
1411     }
1412
1413     switch (type) { 
1414         case V3_PSTATE_EXTERNAL_CONTROL:
1415             switch_to_external();
1416             break;
1417         case V3_PSTATE_DIRECT_CONTROL:
1418             switch_to_direct();
1419             break;
1420         case V3_PSTATE_INTERNAL_CONTROL:
1421             switch_to_internal();
1422             break;
1423         default:
1424             ERROR("Unknown pstate control type %u\n",type);
1425             break;
1426     }
1427
1428 }
1429
1430 // Wrappers for xcalls
1431 static void palacios_pstate_ctrl_acquire_external(void)
1432 {
1433     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1434 }
1435
1436 static void palacios_pstate_ctrl_acquire_direct(void)
1437 {
1438     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1439 }
1440
1441
1442 void palacios_pstate_ctrl_release(void)
1443 {
1444     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1445         put_cpu_var(core_state);
1446         return;
1447     } 
1448     put_cpu_var(core_state);
1449
1450     switch (get_cpu_var(core_state).mode) { 
1451         case V3_PSTATE_EXTERNAL_CONTROL:
1452             put_cpu_var(core_state);
1453             switch_from_external();
1454             break;
1455         case V3_PSTATE_DIRECT_CONTROL:
1456             put_cpu_var(core_state);
1457             switch_from_direct();
1458             break;
1459         case V3_PSTATE_INTERNAL_CONTROL:
1460             put_cpu_var(core_state);
1461             switch_from_internal();
1462             break;
1463         default:
1464             put_cpu_var(core_state);
1465             ERROR("Unknown pstate control type %u\n",core_state.mode);
1466             break;
1467     }
1468 }
1469
1470
1471 static void update_hw_pstate(void *arg)
1472 {
1473     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1474         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1475         put_cpu_var(core_state);
1476     } else {
1477         get_cpu_var(core_state).cur_hw_pstate = 0;
1478         put_cpu_var(core_state);
1479     }
1480 }
1481
1482
1483 /***************************************************************************
1484   PROC Interface to expose state
1485  ***************************************************************************/
1486
1487 static int pstate_show(struct seq_file * file, void * v)
1488 {
1489     unsigned int cpu;
1490     unsigned int numcpus = num_online_cpus();
1491
1492     seq_printf(file, "V3VEE DVFS Status\n\n");
1493
1494     for (cpu=0;cpu<numcpus;cpu++) { 
1495         palacios_xcall(cpu,update_hw_pstate,0);
1496     }
1497
1498     seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1499             machine_state.arch==INTEL ? "Intel" : 
1500             machine_state.arch==AMD ? "AMD" : "Other",
1501             machine_state.supports_pstates ? "Yes" : "No");
1502
1503     for (cpu=0;cpu<numcpus;cpu++) { 
1504         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1505         seq_printf(file,"pcore %u: hw pstate 0x%x mode %s of [ host ",cpu,
1506                 s->cur_hw_pstate,
1507                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1508                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1509                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1510                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1511         if (s->have_cpufreq) { 
1512             seq_printf(file,"external ");
1513         }
1514         if (machine_state.supports_pstates) {
1515             seq_printf(file,"direct ");
1516         }
1517         seq_printf(file,"internal ] ");
1518         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1519             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1520         } 
1521         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1522             seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1523         }
1524         seq_printf(file,"\n");
1525     }
1526     return 0;
1527 }
1528
1529 static int pstate_open(struct inode * inode, struct file * file) 
1530 {
1531     return single_open(file, pstate_show, NULL);
1532 }
1533
1534
1535 static struct file_operations pstate_fops = {
1536     .owner = THIS_MODULE,
1537     .open = pstate_open, 
1538     .read = seq_read,
1539     .llseek = seq_lseek,
1540     .release = seq_release
1541 };
1542
1543 int pstate_proc_setup(void)
1544 {
1545     struct proc_dir_entry *proc;
1546
1547     proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1548
1549     if (!proc) { 
1550         ERROR("Failed to create proc entry for p-state control\n");
1551         return -1;
1552     }
1553
1554     proc->proc_fops = &pstate_fops;
1555
1556     return 0;
1557 }
1558
/* Remove the "v3-dvfs" entry from the Palacios proc directory. */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *dir = palacios_get_procdir();

    remove_proc_entry("v3-dvfs",dir);
}
1563
1564 /********************************************************************
1565   User interface (ioctls)
1566  ********************************************************************/
1567
1568 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1569 {
1570     struct v3_dvfs_ctrl_request r;
1571
1572     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1573         ERROR("Failed to copy DVFS request from user\n");
1574         return -EFAULT;
1575     }
1576
1577     if (r.pcore >= num_online_cpus()) {
1578         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1579         return -EFAULT;
1580     }
1581
1582     switch (r.cmd) {
1583         case V3_DVFS_ACQUIRE: {
1584                                   switch (r.acq_type) { 
1585                                       case V3_DVFS_EXTERNAL:
1586                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1587                                           return 0;
1588                                           break;
1589                                       case V3_DVFS_DIRECT:
1590                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1591                                           return 0;
1592                                           break;
1593                                       default:
1594                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1595                                           return -EFAULT;
1596                                   }
1597                               }
1598                               break;
1599         case V3_DVFS_RELEASE: {
1600                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1601                                   return 0;
1602                               }
1603                               break;
1604         case V3_DVFS_SETFREQ: {
1605                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1606                                   return 0;
1607                               }
1608                               break;
1609         case V3_DVFS_SETPSTATE: {
1610                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1611                                     return 0;
1612                                 }
1613         default: {
1614                      ERROR("Unknown DVFS command %u\n",r.cmd);
1615                      return -EFAULT;
1616                  }
1617                  break;
1618     }
1619 }
1620
1621
1622 void pstate_user_setup(void)
1623 {
1624     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1625 }
1626
1627
1628 void pstate_user_teardown(void)
1629 {
1630     remove_global_ctrl(V3_DVFS_CTRL);
1631 }
1632
1633 static struct v3_host_pstate_ctrl_iface hooks = {
1634     .get_chars = palacios_pstate_ctrl_get_chars,
1635     .acquire = palacios_pstate_ctrl_acquire,
1636     .release = palacios_pstate_ctrl_release,
1637     .set_pstate = palacios_pstate_ctrl_set_pstate,
1638     .get_pstate = palacios_pstate_ctrl_get_pstate,
1639     .set_freq = palacios_pstate_ctrl_set_freq,
1640     .get_freq = palacios_pstate_ctrl_get_freq,
1641 };
1642
1643
1644
1645 static int pstate_ctrl_init(void) 
1646 {
1647     unsigned int cpu;
1648     unsigned int numcpus = num_online_cpus();
1649
1650     pstate_arch_setup();
1651
1652     for (cpu=0;cpu<numcpus;cpu++) { 
1653         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1654     }
1655
1656     V3_Init_Pstate_Ctrl(&hooks);  
1657
1658     if (pstate_proc_setup()) { 
1659         ERROR("Unable to initialize P-State Control\n");
1660         return -1;
1661     }
1662
1663     pstate_user_setup();
1664
1665     pstate_linux_init();
1666
1667     INFO("P-State Control Initialized\n");
1668
1669     return 0;
1670 }
1671
1672 static int pstate_ctrl_deinit(void)
1673 {
1674     unsigned int cpu;
1675     unsigned int numcpus=num_online_cpus();
1676
1677     pstate_linux_deinit();
1678
1679     pstate_user_teardown();
1680
1681     pstate_proc_teardown();
1682
1683     // release pstate control if we have it, and we need to do this on each processor
1684     for (cpu=0;cpu<numcpus;cpu++) { 
1685         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1686     }
1687
1688
1689     // Free any mapping table we built for Intel
1690     if (intel_pstate_to_ctrl && intel_pstate_to_ctrl != intel_pstate_to_ctrl_internal) { 
1691         palacios_free(intel_pstate_to_ctrl);
1692     }
1693
1694
1695     return 0;
1696 }
1697
1698
1699 static struct linux_ext pstate_ext = {
1700     .name = "PSTATE_CTRL",
1701     .init = pstate_ctrl_init,
1702     .deinit = pstate_ctrl_deinit,
1703     .guest_init = NULL,
1704     .guest_deinit = NULL,
1705 };
1706
1707
1708 register_extension(&pstate_ext);
1709
1710
1711