Palacios Public Git Repository

To check out Palacios, execute

  git clone http://v3vee.org/palacios/palacios.web/palacios.git
This will give you the master branch. You probably want the devel branch or one of the release branches. To switch to the devel branch, simply execute
  cd palacios
  git checkout --track -b devel origin/devel
The other branches are similar.


added linux cpufreq interface to dvfs code.
[palacios.git] / linux_module / iface-pstate-ctrl.c
1 /*
2  * This file is part of the Palacios Virtual Machine Monitor developed
3  * by the V3VEE Project with funding from the United States National 
4  * Science Foundation and the Department of Energy.  
5  *
6  * The V3VEE Project is a joint project between Northwestern University
7  * and the University of New Mexico.  You can find out more at 
8  * http://www.v3vee.org
9  *
10  * Copyright (c) 2014, the V3VEE Project <http://www.v3vee.org>
11  * all rights reserved.
12  *
13  * Author: Kyle C. Hale <kh@u.northwestern.edu>
14  *         Shiva Rao <shiva.rao.717@gmail.com>
15  *         Peter Dinda <pdinda@northwestern.edu>
16  *
17  * This is free software.  you are permitted to use,
18  * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
19  */
20
21 #include <linux/uaccess.h>
22 #include <linux/seq_file.h>
23 #include <linux/proc_fs.h>
24 #include <linux/export.h>
25 #include <linux/cpufreq.h>
26 #include <linux/kernel.h>
27 #include <linux/kmod.h>
28 #include <linux/string.h>
29 #include <asm/processor.h>
30 #include <asm/msr.h>
31 #include <asm/msr-index.h>
32
33 #include <interfaces/vmm_pstate_ctrl.h>
34
35 #include "palacios.h"
36 #include "iface-pstate-ctrl.h"
37
38 #include "linux-exts.h"
39
40 /*
41    This P-STATE control implementation includes:
42
43    - Direct control of Intel and AMD processor pstates
44    - External control of processor states via Linux (unimplemented)
45    - Internal control of processor states in Palacios (handoff from Linux)
46
47    Additionally, it provides a user-space interface for manipulating
48    p-state regardless of the host's functionality.  This includes
49    an ioctl for commanding the implementation and a /proc file for 
50    showing current status and capabilities.
51
52 */
53
54
/* Name under which this module's stub cpufreq governor is registered. */
#define PALACIOS_GOVNAME  "v3vee"

/* Size in bytes of the buffer that holds the governor-switch shell command. */
#define MAX_PATH_LEN      128

/* Size in bytes of the buffer that holds a saved Linux governor name. */
#define MAX_GOV_NAME_LEN  16
58
59
/* Mode value used by this file for "the host keeps control of the core". */
#define V3_PSTATE_HOST_CONTROL 0

/*
 * Per-core p-state bookkeeping.  One instance exists per CPU (see the
 * per-CPU core_state variable).
 */
struct pstate_core_info {
    /*
     * Current control mode: V3_PSTATE_HOST_CONTROL, or one of the modes
     * from the Palacios interface (V3_PSTATE_EXTERNAL_CONTROL,
     * V3_PSTATE_DIRECT_CONTROL, V3_PSTATE_INTERNAL_CONTROL).
     */
    uint32_t mode;

    /* Meaningful while in DIRECT mode. */
    uint8_t cur_pstate;
    uint8_t max_pstate;
    uint8_t min_pstate;

    uint8_t cur_hw_pstate;

    /* Meaningful while in EXTERNAL mode. */
    uint64_t cur_freq_khz;
    uint64_t max_freq_khz;
    uint64_t min_freq_khz;

    /* Intel only. */
    uint8_t prior_speedstep;   /* SpeedStep enable bit as found before we changed it */
    uint8_t turbo_disabled;
    uint8_t no_turbo;

    /* Non-zero when a cpufreq policy was found for this core. */
    int have_cpufreq;

    /* Name of Linux's governor, stashed when we make a mode switch. */
    char * linux_governor;

    /* Kept so the frequency we started with can be restored. */
    uint64_t original_hz;
};
94
95
96 static DEFINE_PER_CPU(struct pstate_core_info, core_state);
97
98
99
/*
 * Vendor-specific operations used to assert DIRECT control over a core's
 * p-state.  Each hook acts on the CPU it is invoked on.
 */
struct pstate_core_funcs {
    void    (*arch_init)(void);            /* prepare the core for direct control */
    void    (*arch_deinit)(void);          /* undo whatever arch_init did */
    uint8_t (*get_min_pstate)(void);
    uint8_t (*get_max_pstate)(void);
    uint8_t (*get_pstate)(void);           /* read the current p-state */
    void    (*set_pstate)(uint8_t pstate); /* request a new p-state */
};
109
/*
 * Machine-wide description of the processor vendor and the power-management
 * features discovered through CPUID.
 */
struct pstate_machine_info {
    enum {INTEL, AMD, OTHER } arch;
    int supports_pstates;

    /* AMD feature flags */
    int have_pstate;
    int have_coreboost;
    int have_feedback;

    /* Intel feature flags */
    int have_speedstep;
    int have_opportunistic;    /* "Turbo Boost" or "IDA" */
    int have_policy_hint;
    int have_hwp;              /* hardware-controlled performance states */
    int have_hdc;              /* hardware duty cycling */
    int have_mwait_ext;        /* mwait power extensions */
    int have_mwait_int;        /* mwait wakes on interrupt */

    /* Common to both vendors */
    int have_pstate_hw_coord;  /* mperf/aperf */

    /* Operations table used for DIRECT control; NULL when unsupported. */
    struct pstate_core_funcs *funcs;
};
136
137 static struct pstate_machine_info machine_state;
138
139
140 /****************************************************
141   AMD  DIRECT CONTROL
142  ***************************************************/
143
/*
 * AMD p-state MSR numbers.
 * Reference: AMD Programmer's Manual Vol 2 (Rev 3, 2013), Sec. 17.1, pp.557
 */
#define MSR_PSTATE_LIMIT_REG_AMD  0xc0010061  /* p-state limits  */
#define MSR_PSTATE_CTL_REG_AMD    0xc0010062  /* p-state control */
#define MSR_PSTATE_STAT_REG_AMD   0xc0010063  /* p-state status  */
148
/* Overlay for the value read from MSR_PSTATE_LIMIT_REG_AMD. */
struct p_state_limit_reg_amd {
    union {
        uint64_t val;                  /* raw 64-bit register value */
        struct {
            /* lowest P-state value (highest perf.) supported currently;
               this can change at runtime */
            uint8_t  pstate_limit : 4;
            /* highest P-state value supported (lowest perf) */
            uint8_t  pstate_max   : 4;
            uint64_t rsvd         : 56;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
159
160
/* Overlay for the value read from MSR_PSTATE_STAT_REG_AMD. */
struct p_state_stat_reg_amd {
    union {
        uint64_t val;              /* raw 64-bit register value */
        struct {
            uint8_t  pstate  : 4;  /* low four bits: the reported p-state */
            uint64_t rsvd    : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
170
171
/* Overlay for the value written to MSR_PSTATE_CTL_REG_AMD. */
struct p_state_ctl_reg_amd {
    union {
        uint64_t val;           /* raw 64-bit register value */
        struct {
            uint8_t  cmd  : 4;  /* low four bits: the requested p-state */
            uint64_t rsvd : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
181
182
183 /* CPUID Fn8000_0007_EDX[HwPstate(7)] = 1 */
184 static uint8_t supports_pstates_amd (void)
185 {
186     uint32_t eax, ebx, ecx, edx;
187
188     cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
189     machine_state.have_pstate = !!(edx & (1 << 7));
190     machine_state.have_coreboost = !!(edx & (1<<9));
191     machine_state.have_feedback = !!(edx & (1<<11));
192
193     cpuid(0x6, &eax, &ebx, &ecx, &edx);
194     machine_state.have_pstate_hw_coord =  !!(ecx & 1); 
195
196     INFO("P-State: AMD: Pstates=%d Coreboost=%d Feedback=%d PstateHWCoord=%d\n",
197             machine_state.have_pstate, 
198             machine_state.have_coreboost, 
199             machine_state.have_feedback,
200             machine_state.have_pstate_hw_coord);
201
202     return machine_state.have_pstate;
203
204
205 }
206
207
/* AMD needs no per-core preparation before direct p-state control. */
static void init_arch_amd(void)
{
    /* intentionally empty */
}
212
213
/* AMD needs no per-core teardown after direct p-state control. */
static void deinit_arch_amd(void)
{
    /* intentionally empty */
}
218
219
220 static uint8_t get_pstate_amd(void) 
221 {
222     struct p_state_stat_reg_amd pstat;
223
224     rdmsrl(MSR_PSTATE_STAT_REG_AMD, pstat.val);
225
226     get_cpu_var(core_state).cur_pstate=pstat.reg.pstate;
227     put_cpu_var(core_state);
228
229     return pstat.reg.pstate;
230 }
231
232
233 static void set_pstate_amd(uint8_t p)
234 {
235     struct p_state_ctl_reg_amd pctl;
236     pctl.val = 0;
237     pctl.reg.cmd = p;
238
239     wrmsrl(MSR_PSTATE_CTL_REG_AMD, pctl.val);
240
241     get_cpu_var(core_state).cur_pstate=p;
242     put_cpu_var(core_state);
243 }
244
245
246 /*
247  * NOTE: HW may change this value at runtime
248  */
249 static uint8_t get_max_pstate_amd(void)
250 {
251     struct p_state_limit_reg_amd plimits;
252
253     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
254
255     return plimits.reg.pstate_max;
256 }
257
258
259 static uint8_t get_min_pstate_amd(void)
260 {
261     struct p_state_limit_reg_amd plimits;
262
263     rdmsrl(MSR_PSTATE_LIMIT_REG_AMD, plimits.val);
264
265     return plimits.reg.pstate_limit;
266 }
267
268
269 static struct pstate_core_funcs amd_funcs =
270 {
271     .arch_init        = init_arch_amd,
272     .arch_deinit      = deinit_arch_amd,
273     .get_pstate       = get_pstate_amd,
274     .set_pstate       = set_pstate_amd,
275     .get_max_pstate   = get_max_pstate_amd,
276     .get_min_pstate   = get_min_pstate_amd,
277 };
278
279
280
281 /***********************************************************
282   INTEL DIRECT CONTROL
283  **********************************************************/
284
285
286 /*
287    This implementation uses SpeedStep, but does check
288    to see if the other features (MPERF/APERF, Turbo/IDA, HWP)
289    are available.
290    */
291
/*
 * Intel MSR numbers used by this file.
 * Reference: Intel System Programmer's Manual Vol. 3B, 14-2
 *
 * The spelling of MSR_ENERY_PERF_BIAS_IA32 is kept as-is because it is the
 * name defined here.
 */
#define MSR_PLATFORM_INFO_IA32      0x000000ce
#define MSR_MPERF_IA32              0x000000e7
#define MSR_APERF_IA32              0x000000e8
#define MSR_PERF_STAT_IA32          0x00000198
#define MSR_PERF_CTL_IA32           0x00000199
#define MSR_MISC_ENABLE_IA32        0x000001a0
#define MSR_NHM_TURBO_RATIO_LIMIT   0x000001ad
#define MSR_ENERY_PERF_BIAS_IA32    0x000001b0
301
302
303 /* Note that the actual meaning of the pstate
304    in the control and status registers is
305    implementation dependent, unlike AMD.   The "official"
306    way to figure out the mapping from pstate to
307    these values is via ACPI.  What is written in the register
308    is an "id" of an operating point.
309
310    "Often", the 16 bit field consists of a high order byte,
311    which is the frequency (the multiplier), and a low order
312    byte, which is the voltage.
313    */
/* Overlay for MSR_PERF_CTL_IA32 (read/write). */
struct perf_ctl_reg_intel {
    union {
        uint64_t val;   /* raw 64-bit register value */
        struct {
            /*
             * Target operating point.  This is not the ACPI pstate: Intel
             * treats the value as opaque.  On lots of implementations it
             * seems to be frequency_id : voltage_id, where frequency_id is
             * typically the multiplier.
             */
            uint16_t pstate                 : 16;
            uint16_t reserved               : 16;
            /*
             * Set to 1 to *disengage* dynamic acceleration.
             * "IDA" and "Turbo" share this interface.
             */
            uint16_t dynamic_accel_disable  : 1;
            uint32_t reserved2              : 31;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
334
/* Overlay for MSR_PERF_STAT_IA32 (read-only). */
struct perf_stat_reg_intel {
    union {
        uint64_t val;   /* raw 64-bit register value */
        struct {
            uint16_t pstate                 : 16;  /* the current operating point */
            uint64_t reserved               : 48;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
346
/* Overlay for the energy/performance bias MSR (read/write). */
struct enery_perf_bias_reg_intel {
    union {
        uint64_t val;   /* raw 64-bit register value */
        struct {
            uint8_t  policy_hint            : 4;   /* the current hint, low four bits */
            uint64_t reserved               : 60;
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
358
/*
 * Overlay for MSR_PLATFORM_INFO.
 *
 * Every field is declared with a uint64_t base type on purpose.  The inner
 * struct is not itself packed, so a bit-field may not straddle the storage
 * unit of its declared type: with uint8_t/uint16_t base types the 10-bit
 * field starting at bit 30 and the 8-bit field after it get pushed to the
 * next unit, which moves min_ratio away from bits 47:40 and makes the
 * overlay larger than the 64-bit register.  A single 64-bit storage unit
 * keeps every field at the bit position noted beside it.
 */
struct turbo_mode_info_reg_intel {
    union {
        uint64_t val;   /* raw 64-bit register value */
        struct {
            uint64_t rsvd0                  : 8;   /* bits  7:0  */
            uint64_t max_noturbo_ratio      : 8;   /* bits 15:8  */
            uint64_t rsvd1                  : 7;   /* bits 22:16 */
            uint64_t ppin_cap               : 1;   /* bit  23    */
            uint64_t rsvd2                  : 4;   /* bits 27:24 */
            uint64_t ratio_limit            : 1;   /* bit  28    */
            uint64_t tdc_tdp_limit          : 1;   /* bit  29    */
            uint64_t rsvd3                  : 10;  /* bits 39:30 */
            uint64_t min_ratio              : 8;   /* bits 47:40 */
            uint64_t rsvd4                  : 16;  /* bits 63:48 */
        } reg;
    } __attribute__((packed));
} __attribute__((packed));
377
378
379 /* CPUID.01:ECX.AES(7) */
380 static uint8_t supports_pstates_intel(void)
381 {
382     /* NOTE: CPUID.06H:ECX.SETBH[bit 3] is set and it also implies the presence of a new architectural MSR called IA32_ENERGY_PERF_BIAS (1B0H).
383     */
384     uint32_t eax, ebx, ecx, edx;
385
386     cpuid(0x1, &eax, &ebx, &ecx, &edx);
387     machine_state.have_speedstep =  !!(ecx & (1 << 7));
388
389     cpuid(0x6, &eax, &ebx, &ecx, &edx);
390     machine_state.have_pstate_hw_coord =  !!(ecx & 1); // ?
391     machine_state.have_opportunistic =  !!(eax & 1<<1);
392     machine_state.have_policy_hint = !!(ecx & 1<<3);
393     machine_state.have_hwp = !!(eax & 1<<7);
394     machine_state.have_hdc = !!(eax & 1<<13);
395
396     cpuid(0x5, &eax, &ebx, &ecx, &edx);
397     machine_state.have_mwait_ext =  !!(ecx & 1);
398     machine_state.have_mwait_int =  !!(ecx & 1<<1);
399
400
401     INFO("P-State: Intel: Speedstep=%d, PstateHWCoord=%d, Opportunistic=%d PolicyHint=%d HWP=%d HDC=%d, MwaitExt=%d MwaitInt=%d \n",
402             machine_state.have_speedstep, 
403             machine_state.have_pstate_hw_coord, 
404             machine_state.have_opportunistic,
405             machine_state.have_policy_hint,
406             machine_state.have_hwp,
407             machine_state.have_hdc,
408             machine_state.have_mwait_ext,
409             machine_state.have_mwait_int );
410
411     return machine_state.have_speedstep;
412 }
413
414
415 static void init_arch_intel(void)
416 {
417     uint64_t val;
418
419     rdmsrl(MSR_MISC_ENABLE_IA32, val);
420
421     // store prior speedstep setting
422     get_cpu_var(core_state).prior_speedstep=(val >> 16) & 0x1;
423     put_cpu_var(core_state);
424
425     // enable speedstep (probably already on)
426     val |= 1 << 16;
427     wrmsrl(MSR_MISC_ENABLE_IA32, val);
428
429 }
430
431 static void deinit_arch_intel(void)
432 {
433     uint64_t val;
434
435     rdmsrl(MSR_MISC_ENABLE_IA32, val);
436
437     val &= ~(1ULL << 16);
438     val |= get_cpu_var(core_state).prior_speedstep << 16;
439     put_cpu_var(core_state);
440
441     wrmsrl(MSR_MISC_ENABLE_IA32, val);
442
443 }
444
445 /* TODO: Intel P-states require sampling at intervals... */
446 static uint8_t get_pstate_intel(void)
447 {
448     uint64_t val;
449     uint16_t pstate;
450
451     rdmsrl(MSR_PERF_STAT_IA32,val);
452
453     pstate = val & 0xffff;
454
455     INFO("P-State: Get: 0x%llx\n", val);
456
457     // Assume top byte is the FID
458     //if (pstate & 0xff ) { 
459     //  ERROR("P-State: Intel returns confusing pstate %u\n",pstate);
460     //}
461
462     // should check if turbo is active, in which case 
463     // this value is not the whole story
464
465     return (uint8_t) (pstate>>8);
466 }
467
468 static void set_pstate_intel(uint8_t p)
469 {
470     uint64_t val;
471
472     /* ...Intel IDA (dynamic acceleration)
473        if (c->no_turbo && !c->turbo_disabled) {
474        val |= 1 << 32;
475        }
476        */
477     // leave all bits along expect for the likely
478     // fid bits
479
480     rdmsrl(MSR_PERF_CTL_IA32, val);
481     val &= ~0xff00ULL;
482     val |= ((uint64_t)p)<<8;
483
484     INFO("P-State: Set: 0x%llx\n", val);
485
486     wrmsrl(MSR_PERF_CTL_IA32, val);
487
488     get_cpu_var(core_state).cur_pstate = p;
489     put_cpu_var(core_state);
490 }
491
492
493 static uint8_t get_min_pstate_intel(void)
494 {
495     struct turbo_mode_info_reg_intel t;
496
497     rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
498
499     return t.reg.min_ratio;
500 }
501
502
503
504 static uint8_t get_max_pstate_intel (void)
505 {
506     struct turbo_mode_info_reg_intel t;
507
508     rdmsrl(MSR_PLATFORM_INFO_IA32, t.val);
509
510     return t.reg.max_noturbo_ratio;
511 }
512
513 static struct pstate_core_funcs intel_funcs =
514 {
515     .arch_init        = init_arch_intel,
516     .arch_deinit      = deinit_arch_intel,
517     .get_pstate       = get_pstate_intel,
518     .set_pstate       = set_pstate_intel,
519     .get_max_pstate   = get_max_pstate_intel,
520     .get_min_pstate   = get_min_pstate_intel,
521 };
522
523
524
525 /***********************************************
526   Arch determination and setup
527  ***********************************************/
528
/*
 * Execute CPUID with EAX = id and store the resulting EAX, EBX, ECX and EDX
 * in dest[0..3], in that order.
 */
static inline void cpuid_string (uint32_t id, uint32_t dest[4])
{
    __asm__ __volatile__("cpuid"
            : "=a"(dest[0]), "=b"(dest[1]), "=c"(dest[2]), "=d"(dest[3])
            : "a"(id));
}
535
536
/*
 * Fetch the 12-character CPU vendor string from CPUID leaf 0.
 *
 * @name: caller-supplied buffer of at least 13 bytes; receives the vendor
 *        string followed by a terminating NUL.
 *
 * Returns the value CPUID leaf 0 leaves in EAX.
 *
 * The register words are copied with memcpy() instead of being stored
 * through a uint32_t pointer cast, because `name` is a char buffer with no
 * alignment guarantee.
 */
static int get_cpu_vendor (char name[13])
{
    uint32_t regs[4];

    cpuid_string(0, regs);

    /* the vendor string is laid out as EBX, EDX, ECX */
    memcpy(name + 0, &regs[1], sizeof(uint32_t));
    memcpy(name + 4, &regs[3], sizeof(uint32_t));
    memcpy(name + 8, &regs[2], sizeof(uint32_t));
    name[12] = 0;

    return regs[0];
}
551
552
/* Return 1 when the CPU vendor string is "GenuineIntel", 0 otherwise. */
static int is_intel (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);
    return strcmp(vendor, "GenuineIntel") == 0;
}
559
560
/* Return 1 when the CPU vendor string is "AuthenticAMD", 0 otherwise. */
static int is_amd (void)
{
    char vendor[13];

    get_cpu_vendor(vendor);
    return strcmp(vendor, "AuthenticAMD") == 0;
}
567
568 static int pstate_arch_setup(void)
569 {
570
571     if (is_amd()) {
572         machine_state.arch = AMD;
573         machine_state.funcs = &amd_funcs;
574         machine_state.supports_pstates = supports_pstates_amd();
575         INFO("PSTATE: P-State initialized for AMD\n");
576     } else if (is_intel()) {
577         machine_state.arch  = INTEL;
578         machine_state.funcs = &intel_funcs;
579         machine_state.supports_pstates = supports_pstates_intel();
580         INFO("PSTATE: P-State initialized for INTEL (Work in progress...)\n");
581         return 0;
582
583     } else {
584         machine_state.arch = OTHER;
585         machine_state.funcs = NULL;
586         machine_state.supports_pstates = 0;
587         INFO("PSTATE: P-state control: No support for direct control on this architecture\n");
588         return 0;
589     }
590
591     return 0;
592 }
593
594
595
596 /******************************************************************
597   Linux Interface
598  *****************************************************************/
599
600
601 /* 
602  * This stub governor is simply a placeholder for preventing 
603  * frequency changes from the Linux side. For now, we simply leave
604  * the frequency as is when we acquire control. 
605  */
606 static int governor_run(struct cpufreq_policy *policy, unsigned int event)
607 {
608
609     switch (event) {
610         /* we can't use cpufreq_driver_target here as it can result
611          * in a circular dependency, so we'll just do nothing.
612          */
613         case CPUFREQ_GOV_START:
614         case CPUFREQ_GOV_STOP:
615         case CPUFREQ_GOV_LIMITS:
616             /* do nothing */
617             break;
618         default:
619             ERROR("Undefined governor command\n");
620             return -1;
621     }                           
622
623     return 0;
624 }
625
626
627 static struct cpufreq_governor stub_governor = 
628 {
629     .name = PALACIOS_GOVNAME,
630     .governor = governor_run,
631     .owner = THIS_MODULE,
632 };
633
634
635 static inline void pstate_register_linux_governor(void)
636 {
637     cpufreq_register_governor(&stub_governor);
638 }
639
640
641 static inline void pstate_unregister_linux_governor(void)
642 {
643     cpufreq_unregister_governor(&stub_governor);
644 }
645
646
647 static int get_current_governor(char **buf, unsigned int cpu)
648 {
649     struct cpufreq_policy * policy = palacios_alloc(sizeof(struct cpufreq_policy));
650     char * govname = NULL;
651
652     if (!policy) {
653         ERROR("could not allocate cpufreq_policy\n");
654         return -1;
655     }
656         
657     if (cpufreq_get_policy(policy, cpu) != 0) {
658         ERROR("Could not get current cpufreq policy\n");
659         goto out_err;
660     }
661
662     /* We're in interrupt context, should probably not wait here */
663     govname = palacios_alloc(MAX_GOV_NAME_LEN);
664     if (!govname) {
665         ERROR("Could not allocate space for governor name\n");
666         goto out_err;
667     }
668
669     strncpy(govname, policy->governor->name, MAX_GOV_NAME_LEN);
670
671     get_cpu_var(core_state).linux_governor = govname;
672     put_cpu_var(core_state);
673
674     *buf = govname;
675
676     palacios_free(policy);
677
678     return 0;
679
680 out_err:
681     palacios_free(policy);
682     return -1;
683 }
684
685
686 /* passed to the userspacehelper interface for cleanup */
687 static void gov_switch_cleanup(struct subprocess_info * s)
688 {
689     palacios_free(s->argv[2]);
690     palacios_free(s->argv);
691 }
692
693
694 /* 
695  * Switch governors
696  * @s - the governor to switch to 
697  */
698 static int governor_switch(char * s, unsigned int cpu)
699 {
700     char * path_str = NULL;
701     char ** argv = NULL; 
702
703     static char * envp[] = {
704         "HOME=/",
705         "TERM=linux",
706         "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
707
708
709     argv = palacios_alloc(4*sizeof(char*));
710     if (!argv) {
711         ERROR("Couldn't allocate argv struct\n");
712         return -1;
713     }
714
715     path_str = palacios_alloc(MAX_PATH_LEN);
716     if (!path_str) {
717         ERROR("Couldn't allocate path string\n");
718         goto out_freeargv;
719     }
720     memset(path_str, 0, MAX_PATH_LEN);
721
722     snprintf(path_str, MAX_PATH_LEN, "echo %s > /sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor", s, cpu);
723
724     argv[0] = "/bin/sh";
725     argv[1] = "-c";
726     argv[2] = path_str;
727     argv[3] = NULL;
728
729     /* KCH: we can't wait here to actually see if we succeeded, we're in interrupt context */
730     return call_usermodehelper_fns("/bin/sh", argv, envp, UMH_NO_WAIT, NULL, gov_switch_cleanup, NULL);
731
732 out_freeargv:
733     palacios_free(argv);
734     return -1;
735 }
736
737
738 static inline void free_linux_governor(void)
739 {
740     palacios_free(get_cpu_var(core_state).linux_governor);
741     put_cpu_var(core_state);
742 }
743
744
745 static int linux_setup_palacios_governor(void)
746 {
747     char * gov;
748     unsigned int cpu = get_cpu();
749
750     /* KCH:  we assume the v3vee governor is already 
751      * registered with kernel by this point 
752      */
753
754     if (get_current_governor(&gov, cpu) < 0) {
755         ERROR("Could not get current governor\n");
756         return -1;
757     }
758
759     DEBUG("saving current governor (%s)\n", gov);
760
761     get_cpu_var(core_state).linux_governor = gov;
762     put_cpu_var(core_state);
763     
764     DEBUG("setting the new governor (%s)\n", PALACIOS_GOVNAME);
765
766     /* set the new one to ours */
767     if (governor_switch(PALACIOS_GOVNAME, cpu) < 0) {
768         ERROR("Could not set governor to (%s)\n", PALACIOS_GOVNAME);
769         return -1;
770     }
771
772     return 0;
773 }
774
775
#if 0
/* Disabled placeholder: compiled out, does nothing and reports success. */
static int linux_deinit(void)
{
    return 0;
}
#endif
782
783
784 static int linux_get_pstate(void)
785 {
786     struct cpufreq_policy * policy = NULL;
787     struct cpufreq_frequency_table *table;
788     int cpu = get_cpu();
789     unsigned int i = 0;
790     unsigned int count = 0;
791
792     policy = palacios_alloc(sizeof(struct cpufreq_policy));
793     if (!policy) {
794         ERROR("Could not allocate policy struct\n");
795         return -1;
796     }
797
798     cpufreq_get_policy(policy, cpu);
799     table = cpufreq_frequency_get_table(cpu);
800
801     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
802
803         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
804             continue;
805         }
806
807         if (table[i].frequency == policy->cur) {
808             break;
809         }
810
811         count++;
812     }
813
814     palacios_free(policy);
815     return count;
816 }
817
818
819 static int linux_get_freq(void)
820 {
821     struct cpufreq_policy * policy = NULL;
822     int cpu = get_cpu();
823
824     policy = palacios_alloc(sizeof(struct cpufreq_policy));
825     if (!policy) {
826         ERROR("Could not allocate policy struct\n");
827         return -1;
828     }
829
830     if (cpufreq_get_policy(policy, cpu)) {
831         ERROR("Could not get current policy\n");
832         return -1;
833     }
834
835     return policy->cur;
836 }
837
838
839 static int linux_set_pstate(uint8_t p)
840 {
841     struct cpufreq_policy * policy = NULL;
842     struct cpufreq_frequency_table *table;
843     int cpu = get_cpu();
844     unsigned int i = 0;
845     unsigned int count = 0;
846     int state_set = 0;
847     int last_valid = 0;
848
849     policy = palacios_alloc(sizeof(struct cpufreq_policy));
850     if (!policy) {
851         ERROR("Could not allocate policy struct\n");
852         return -1;
853     }
854
855     if (cpufreq_get_policy(policy, cpu)) {
856         ERROR("Could not get current policy\n");
857         goto out_err;
858     }
859     table = cpufreq_frequency_get_table(cpu);
860
861     for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
862
863         if (table[i].frequency == CPUFREQ_ENTRY_INVALID) {
864             continue;
865         }
866
867         if (count == p) {
868             cpufreq_driver_target(policy, table[i].frequency, CPUFREQ_RELATION_H);
869             state_set = 1;
870         }
871
872         count++;
873         last_valid = i;
874     }
875
876     /* we need to deal with the case in which we get a number > max pstate */
877     if (!state_set) {
878         cpufreq_driver_target(policy, table[last_valid].frequency, CPUFREQ_RELATION_H);
879     }
880
881     palacios_free(policy);
882     return 0;
883
884 out_err:
885     palacios_free(policy);
886     return -1;
887 }
888
889
890 static int linux_set_freq(uint64_t f)
891 {
892     struct cpufreq_policy * policy = NULL;
893     int cpu = get_cpu();
894     uint64_t freq;
895
896     policy = palacios_alloc(sizeof(struct cpufreq_policy));
897     if (!policy) {
898         ERROR("Could not allocate policy struct\n");
899         return -1;
900     }
901
902     cpufreq_get_policy(policy, cpu);
903
904     if (f < policy->min) {
905         freq = policy->min;
906     } else if (f > policy->max) {
907         freq = policy->max;
908     } else {
909         freq = f;
910     }
911
912     cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_H);
913
914     palacios_free(policy);
915     return 0;
916 }
917
918
919 static int linux_restore_defaults(void)
920 {
921     unsigned int cpu = get_cpu();
922     char * gov = NULL;
923
924     gov = get_cpu_var(core_state).linux_governor;
925     put_cpu_var(core_state);
926
927     DEBUG("restoring previous governor (%s)\n", gov);
928
929     if (governor_switch(gov, cpu) < 0) {
930         ERROR("Could not restore governor to (%s)\n", gov);
931         goto out_err;
932     }
933
934     free_linux_governor();
935     return 0;
936
937 out_err:
938     free_linux_governor();
939     return -1;
940 }
941
942
943
944 /******************************************************************
945   Generic Interface as provided to Palacios and to the rest of the
946   module
947  ******************************************************************/
948
949 static void init_core(void)
950 {
951     unsigned cpu;
952     struct cpufreq_policy *p;
953
954
955     DEBUG("P-State Core Init\n");
956
957     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
958     get_cpu_var(core_state).cur_pstate = 0;
959
960     if (machine_state.funcs) {
961         get_cpu_var(core_state).min_pstate = machine_state.funcs->get_min_pstate();
962         get_cpu_var(core_state).max_pstate = machine_state.funcs->get_max_pstate();
963     } else {
964         get_cpu_var(core_state).min_pstate = 0;
965         get_cpu_var(core_state).max_pstate = 0;
966     }
967
968
969     cpu = get_cpu(); put_cpu();
970
971     p = cpufreq_cpu_get(cpu);
972
973     if (!p) { 
974         get_cpu_var(core_state).have_cpufreq = 0;
975         get_cpu_var(core_state).min_freq_khz=0;
976         get_cpu_var(core_state).max_freq_khz=0;
977         get_cpu_var(core_state).cur_freq_khz=0;
978     } else {
979         get_cpu_var(core_state).have_cpufreq = 1;
980         get_cpu_var(core_state).min_freq_khz=p->min;
981         get_cpu_var(core_state).max_freq_khz=p->max;
982         get_cpu_var(core_state).cur_freq_khz=p->cur;
983         cpufreq_cpu_put(p);
984     }
985
986     put_cpu_var(core_state);
987
988 }
989
990
/* Forward declaration: deinit_core() calls this before its definition appears. */
void palacios_pstate_ctrl_release(void);
992
993
/*
 * Tear down p-state control on the cpu this runs on by releasing whatever
 * control mode is active.
 *
 * Preemption stays disabled across the release, as before, but the
 * get_cpu() is now paired with put_cpu().
 */
static void deinit_core(void)
{
    DEBUG("P-State Core Deinit\n");

    get_cpu();
    palacios_pstate_ctrl_release();
    put_cpu();
}
1001
1002
1003
1004 void palacios_pstate_ctrl_get_chars(struct v3_cpu_pstate_chars *c) 
1005 {
1006     memset(c,0,sizeof(struct v3_cpu_pstate_chars));
1007
1008
1009     c->features = V3_PSTATE_INTERNAL_CONTROL;
1010
1011     if (get_cpu_var(core_state).have_cpufreq) {
1012         c->features |= V3_PSTATE_EXTERNAL_CONTROL;
1013     }
1014
1015     if (machine_state.arch==AMD || machine_state.arch==INTEL) { 
1016         c->features |= V3_PSTATE_DIRECT_CONTROL;
1017     }
1018     c->cur_mode = get_cpu_var(core_state).mode;
1019     c->min_pstate = get_cpu_var(core_state).min_pstate;
1020     c->max_pstate = get_cpu_var(core_state).max_pstate;
1021     c->cur_pstate = get_cpu_var(core_state).cur_pstate;
1022     c->min_freq_khz = get_cpu_var(core_state).min_freq_khz;
1023     c->max_freq_khz = get_cpu_var(core_state).max_freq_khz;
1024     c->cur_freq_khz = get_cpu_var(core_state).cur_freq_khz;
1025
1026     put_cpu_var(core_state);
1027
1028
1029
1030 }
1031
1032
1033 uint8_t palacios_pstate_ctrl_get_pstate(void)
1034 {
1035     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1036         put_cpu_var(core_state);
1037         return machine_state.funcs->get_pstate();
1038     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1039         put_cpu_var(core_state);
1040         return linux_get_pstate();
1041     } else {
1042         put_cpu_var(core_state);
1043         return 0;
1044     }
1045 }
1046
1047
1048 void palacios_pstate_ctrl_set_pstate(uint8_t p)
1049 {
1050     if (get_cpu_var(core_state).mode==V3_PSTATE_DIRECT_CONTROL) { 
1051         put_cpu_var(core_state);
1052         machine_state.funcs->set_pstate(p);
1053     } else if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) {
1054         put_cpu_var(core_state);
1055         linux_set_pstate(p);
1056     } 
1057 }
1058
1059
/*
 * void*-argument adapter for palacios_pstate_ctrl_set_pstate(): the p-state
 * is carried in the pointer value itself and truncated to 8 bits.
 */
void palacios_pstate_ctrl_set_pstate_wrapper(void *p)
{
    uint8_t pstate = (uint8_t)(uint64_t)p;

    palacios_pstate_ctrl_set_pstate(pstate);
}
1064
1065
1066 uint64_t palacios_pstate_ctrl_get_freq(void)
1067 {
1068     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1069         put_cpu_var(core_state);
1070         return linux_get_freq();
1071     } else {
1072         put_cpu_var(core_state);
1073         return 0;
1074     }
1075 }
1076
1077
1078 void palacios_pstate_ctrl_set_freq(uint64_t p)
1079 {
1080     if (get_cpu_var(core_state).mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1081         put_cpu_var(core_state);
1082         linux_set_freq(p);
1083     } 
1084     put_cpu_var(core_state);
1085 }
1086
1087
1088 static int switch_to_external(void)
1089 {
1090     if (!(get_cpu_var(core_state).have_cpufreq)) {
1091         put_cpu_var(core_state);
1092         ERROR("No cpufreq  - cannot switch to external...\n");
1093         return -1;
1094     }
1095     put_cpu_var(core_state);
1096
1097     DEBUG("Switching to external control\n");
1098     return linux_restore_defaults();
1099 }
1100
1101
1102 static int switch_to_direct(void)
1103 {
1104     if (get_cpu_var(core_state).have_cpufreq) { 
1105         put_cpu_var(core_state);
1106         DEBUG("switch to direct from cpufreq\n");
1107
1108         // The implementation would set the policy and governor to peg cpu
1109         // regardless of load
1110         linux_setup_palacios_governor();
1111     }
1112
1113     if (machine_state.funcs && machine_state.funcs->arch_init) {
1114         get_cpu_var(core_state).mode=V3_PSTATE_DIRECT_CONTROL;
1115
1116         machine_state.funcs->arch_init();
1117
1118         put_cpu_var(core_state);
1119     }
1120
1121     return 0;
1122 }
1123
1124
1125 static int switch_to_internal(void)
1126 {
1127     if (get_cpu_var(core_state).have_cpufreq) { 
1128         put_cpu_var(core_state);
1129         DEBUG("switch to internal on machine with cpu freq\n");
1130         linux_setup_palacios_governor();
1131     }
1132
1133     get_cpu_var(core_state).mode=V3_PSTATE_INTERNAL_CONTROL;
1134
1135     put_cpu_var(core_state);
1136
1137     return 0;
1138 }
1139
1140
1141 static int switch_from_external(void)
1142 {
1143     if (!(get_cpu_var(core_state).have_cpufreq)) {
1144         put_cpu_var(core_state);
1145         ERROR("No cpufreq  - how did we get here... external...\n");
1146         return -1;
1147     }
1148
1149     DEBUG("Switching from external...\n");
1150     linux_restore_defaults();
1151
1152     get_cpu_var(core_state).mode = V3_PSTATE_HOST_CONTROL;
1153
1154     put_cpu_var(core_state);
1155
1156     return 0;
1157 }
1158
1159
1160 static int switch_from_direct(void)
1161 {
1162     if (get_cpu_var(core_state).have_cpufreq) { 
1163         put_cpu_var(core_state);
1164         DEBUG("Switching back to cpufreq control from direct\n");
1165         linux_restore_defaults();
1166     }
1167
1168     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1169
1170     machine_state.funcs->set_pstate(get_cpu_var(core_state).min_pstate);
1171
1172     machine_state.funcs->arch_deinit();
1173
1174     put_cpu_var(core_state);
1175
1176     return 0;
1177 }
1178
1179
1180 static int switch_from_internal(void)
1181 {
1182     if (get_cpu_var(core_state).have_cpufreq) { 
1183         put_cpu_var(core_state);
1184         ERROR("Unimplemented: switch from internal on machine with cpu freq - will just pretend to do so\n");
1185         // The implementation would switch back to default policy and governor
1186         linux_restore_defaults();
1187     }
1188
1189     get_cpu_var(core_state).mode=V3_PSTATE_HOST_CONTROL;
1190
1191     put_cpu_var(core_state);
1192
1193     return 0;
1194 }
1195
1196
1197
1198 void palacios_pstate_ctrl_acquire(uint32_t type)
1199 {
1200     if (get_cpu_var(core_state).mode != V3_PSTATE_HOST_CONTROL) { 
1201         palacios_pstate_ctrl_release();
1202     }
1203
1204     put_cpu_var(core_state);
1205
1206     switch (type) { 
1207         case V3_PSTATE_EXTERNAL_CONTROL:
1208             switch_to_external();
1209             break;
1210         case V3_PSTATE_DIRECT_CONTROL:
1211             switch_to_direct();
1212             break;
1213         case V3_PSTATE_INTERNAL_CONTROL:
1214             switch_to_internal();
1215             break;
1216         default:
1217             ERROR("Unknown pstate control type %u\n",type);
1218             break;
1219     }
1220
1221 }
1222
1223 // Wrappers for xcalls
1224 static void palacios_pstate_ctrl_acquire_external(void)
1225 {
1226     palacios_pstate_ctrl_acquire(V3_PSTATE_EXTERNAL_CONTROL);
1227 }
1228
1229 static void palacios_pstate_ctrl_acquire_direct(void)
1230 {
1231     palacios_pstate_ctrl_acquire(V3_PSTATE_DIRECT_CONTROL);
1232 }
1233
1234
1235 void palacios_pstate_ctrl_release(void)
1236 {
1237     if (get_cpu_var(core_state).mode == V3_PSTATE_HOST_CONTROL) { 
1238         put_cpu_var(core_state);
1239         return;
1240     }
1241
1242     switch (get_cpu_var(core_state).mode) { 
1243         case V3_PSTATE_EXTERNAL_CONTROL:
1244             switch_from_external();
1245             break;
1246         case V3_PSTATE_DIRECT_CONTROL:
1247             switch_from_direct();
1248             break;
1249         case V3_PSTATE_INTERNAL_CONTROL:
1250             switch_from_internal();
1251             break;
1252         default:
1253             ERROR("Unknown pstate control type %u\n",core_state.mode);
1254             break;
1255     }
1256
1257     put_cpu_var(core_state);
1258
1259 }
1260
1261
1262 static void update_hw_pstate(void *arg)
1263 {
1264     if (machine_state.funcs && machine_state.funcs->get_pstate) {
1265         get_cpu_var(core_state).cur_hw_pstate = machine_state.funcs->get_pstate();
1266         put_cpu_var(core_state);
1267     } else {
1268         get_cpu_var(core_state).cur_hw_pstate = 0;
1269         put_cpu_var(core_state);
1270     }
1271 }
1272
1273
1274 /***************************************************************************
1275   PROC Interface to expose state
1276  ***************************************************************************/
1277
1278 static int pstate_show(struct seq_file * file, void * v)
1279 {
1280     unsigned int cpu;
1281     unsigned int numcpus = num_online_cpus();
1282
1283     seq_printf(file, "V3VEE DVFS Status\n\n");
1284
1285     for (cpu=0;cpu<numcpus;cpu++) { 
1286         palacios_xcall(cpu,update_hw_pstate,0);
1287     }
1288
1289     seq_printf(file, "Arch:\t%s\nPStates:\t%s\n\n",
1290             machine_state.arch==INTEL ? "Intel" : 
1291             machine_state.arch==AMD ? "AMD" : "Other",
1292             machine_state.supports_pstates ? "Yes" : "No");
1293
1294     for (cpu=0;cpu<numcpus;cpu++) { 
1295         struct pstate_core_info *s = &per_cpu(core_state,cpu);
1296         seq_printf(file,"pcore %u: hw pstate %u mode %s of [ host ",cpu,
1297                 s->cur_hw_pstate,
1298                 s->mode==V3_PSTATE_HOST_CONTROL ? "host" :
1299                 s->mode==V3_PSTATE_EXTERNAL_CONTROL ? "external" :
1300                 s->mode==V3_PSTATE_DIRECT_CONTROL ? "direct" : 
1301                 s->mode==V3_PSTATE_INTERNAL_CONTROL ? "internal" : "UNKNOWN");
1302         if (s->have_cpufreq) { 
1303             seq_printf(file,"external ");
1304         }
1305         if (machine_state.supports_pstates) {
1306             seq_printf(file,"direct ");
1307         }
1308         seq_printf(file,"internal ] ");
1309         if (s->mode==V3_PSTATE_EXTERNAL_CONTROL) { 
1310             seq_printf(file,"(min=%llu max=%llu cur=%llu) ", s->min_freq_khz, s->max_freq_khz, s->cur_freq_khz);
1311         } 
1312         if (s->mode==V3_PSTATE_DIRECT_CONTROL) { 
1313             seq_printf(file,"(min=%u max=%u cur=%u) ", (uint32_t)s->min_pstate, (uint32_t)s->max_pstate, (uint32_t)s->cur_pstate);
1314         }
1315         seq_printf(file,"\n");
1316     }
1317     return 0;
1318 }
1319
1320 static int pstate_open(struct inode * inode, struct file * file) 
1321 {
1322     return single_open(file, pstate_show, NULL);
1323 }
1324
1325
1326 static struct file_operations pstate_fops = {
1327     .owner = THIS_MODULE,
1328     .open = pstate_open, 
1329     .read = seq_read,
1330     .llseek = seq_lseek,
1331     .release = seq_release
1332 };
1333
1334 int pstate_proc_setup(void)
1335 {
1336     struct proc_dir_entry *proc;
1337
1338     proc = create_proc_entry("v3-dvfs",0444, palacios_get_procdir());
1339
1340     if (!proc) { 
1341         ERROR("Failed to create proc entry for p-state control\n");
1342         return -1;
1343     }
1344
1345     proc->proc_fops = &pstate_fops;
1346
1347     return 0;
1348 }
1349
/*
 * Remove the "v3-dvfs" entry from the Palacios proc directory.
 */
void pstate_proc_teardown(void)
{
    struct proc_dir_entry *dir = palacios_get_procdir();

    remove_proc_entry("v3-dvfs", dir);
}
1354
1355 /********************************************************************
1356   User interface (ioctls)
1357  ********************************************************************/
1358
1359 static int dvfs_ctrl(unsigned int cmd, unsigned long arg) 
1360 {
1361     struct v3_dvfs_ctrl_request r;
1362
1363     if (copy_from_user(&r,(void __user*)arg,sizeof(struct v3_dvfs_ctrl_request))) {
1364         ERROR("Failed to copy DVFS request from user\n");
1365         return -EFAULT;
1366     }
1367
1368     if (r.pcore >= num_online_cpus()) {
1369         ERROR("Cannot apply DVFS request to pcore %u\n",r.pcore);
1370         return -EFAULT;
1371     }
1372
1373     switch (r.cmd) {
1374         case V3_DVFS_ACQUIRE: {
1375                                   switch (r.acq_type) { 
1376                                       case V3_DVFS_EXTERNAL:
1377                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_external, NULL);
1378                                           return 0;
1379                                           break;
1380                                       case V3_DVFS_DIRECT:
1381                                           palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_acquire_direct, NULL);
1382                                           return 0;
1383                                           break;
1384                                       default:
1385                                           ERROR("Unknown DVFS acquire type %u\n",r.acq_type);
1386                                           return -EFAULT;
1387                                   }
1388                               }
1389                               break;
1390         case V3_DVFS_RELEASE: {
1391                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_release, NULL);
1392                                   return 0;
1393                               }
1394                               break;
1395         case V3_DVFS_SETFREQ: {
1396                                   palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_freq,(void*)r.freq_khz);
1397                                   return 0;
1398                               }
1399                               break;
1400         case V3_DVFS_SETPSTATE: {
1401                                     palacios_xcall(r.pcore,(void (*)(void*))palacios_pstate_ctrl_set_pstate_wrapper,(void*)(uint64_t)r.pstate);
1402                                     return 0;
1403                                 }
1404         default: {
1405                      ERROR("Unknown DVFS command %u\n",r.cmd);
1406                      return -EFAULT;
1407                  }
1408                  break;
1409     }
1410 }
1411
1412
1413 void pstate_user_setup(void)
1414 {
1415     add_global_ctrl(V3_DVFS_CTRL, dvfs_ctrl);
1416 }
1417
1418
1419 void pstate_user_teardown(void)
1420 {
1421     remove_global_ctrl(V3_DVFS_CTRL);
1422 }
1423
1424 static struct v3_host_pstate_ctrl_iface hooks = {
1425     .get_chars = palacios_pstate_ctrl_get_chars,
1426     .acquire = palacios_pstate_ctrl_acquire,
1427     .release = palacios_pstate_ctrl_release,
1428     .set_pstate = palacios_pstate_ctrl_set_pstate,
1429     .get_pstate = palacios_pstate_ctrl_get_pstate,
1430     .set_freq = palacios_pstate_ctrl_set_freq,
1431     .get_freq = palacios_pstate_ctrl_get_freq,
1432 };
1433
1434
1435
1436 static int pstate_ctrl_init(void) 
1437 {
1438     unsigned int cpu;
1439     unsigned int numcpus = num_online_cpus();
1440
1441     pstate_arch_setup();
1442
1443     for (cpu=0;cpu<numcpus;cpu++) { 
1444         palacios_xcall(cpu,(void ((*)(void*)))init_core,0);
1445     }
1446
1447     V3_Init_Pstate_Ctrl(&hooks);  
1448
1449     if (pstate_proc_setup()) { 
1450         ERROR("Unable to initialize P-State Control\n");
1451         return -1;
1452     }
1453
1454     pstate_user_setup();
1455
1456     pstate_register_linux_governor();
1457
1458     INFO("P-State Control Initialized\n");
1459
1460     return 0;
1461 }
1462
1463 static int pstate_ctrl_deinit(void)
1464 {
1465     unsigned int cpu;
1466     unsigned int numcpus=num_online_cpus();
1467
1468     pstate_unregister_linux_governor();
1469
1470     pstate_user_teardown();
1471
1472     pstate_proc_teardown();
1473
1474     // release pstate control if we have it, and we need to do this on each processor
1475     for (cpu=0;cpu<numcpus;cpu++) { 
1476         palacios_xcall(cpu,(void (*)(void *))deinit_core,0);
1477     }
1478
1479     return 0;
1480 }
1481
1482
1483 static struct linux_ext pstate_ext = {
1484     .name = "PSTATE_CTRL",
1485     .init = pstate_ctrl_init,
1486     .deinit = pstate_ctrl_deinit,
1487     .guest_init = NULL,
1488     .guest_deinit = NULL,
1489 };
1490
1491
1492 register_extension(&pstate_ext);
1493
1494
1495