2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
11 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
12 * All rights reserved.
14 * Author: Jack Lange <jarusl@cs.northwestern.edu>
16 * This is free software. You are permitted to use,
17 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
20 #include <palacios/vmm_mem.h>
21 #include <palacios/vmm.h>
22 #include <palacios/vmcb.h>
23 #include <palacios/vmm_decoder.h>
24 #include <palacios/vm_guest_mem.h>
25 #include <palacios/vmm_ctrl_regs.h>
28 #ifndef DEBUG_CTRL_REGS
30 #define PrintDebug(fmt, args...)
34 static int handle_lmsw(struct guest_info * info, struct x86_instr * dec_instr);
35 static int handle_clts(struct guest_info * info, struct x86_instr * dec_instr);
36 static int handle_mov_to_cr0(struct guest_info * info, struct x86_instr * dec_instr);
39 // First Attempt = 494 lines
40 // current = 106 lines
// Exit handler for guest writes to CR0.
// Fetches the faulting instruction from guest memory, decodes it, and
// dispatches to the matching helper (LMSW, MOV-to-CR0, or CLTS).  On
// success the guest RIP is advanced past the emulated instruction.
// NOTE(review): this listing is elided -- the declarations of `ret` and
// `instr`, the else-branches, error returns, and closing braces are not
// visible here.
int v3_handle_cr0_write(struct guest_info * info) {
    struct x86_instr dec_instr;

    // Fetch up to 15 bytes (max x86 instruction length) at the guest RIP.
    // Use a physical-address read when guest paging is off, otherwise a
    // virtual-address read.  (The two calls below are the two arms of an
    // if/else whose `else` line is elided from this view.)
    if (info->mem_mode == PHYSICAL_MEM) {
	ret = read_guest_pa_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);
	ret = read_guest_va_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);

    // Decode the fetched bytes into a generic x86_instr description.
    if (v3_decode(info, (addr_t)instr, &dec_instr) == -1) {
	PrintError("Could not decode instruction\n");

    // Dispatch on the decoded opcode; each helper returns -1 on failure.
    if (dec_instr.op_type == V3_OP_LMSW) {
	if (handle_lmsw(info, &dec_instr) == -1) {
    } else if (dec_instr.op_type == V3_OP_MOV2CR) {
	if (handle_mov_to_cr0(info, &dec_instr) == -1) {
    } else if (dec_instr.op_type == V3_OP_CLTS) {
	if (handle_clts(info, &dec_instr) == -1) {
	PrintError("Unhandled opcode in handle_cr0_write\n");

    // Skip the guest past the instruction we just emulated.
    info->rip += dec_instr.instr_length;
83 // The CR0 register only has flags in the low 32 bits
84 // The hardware does a format check to make sure the high bits are zero
85 // Because of this we can ignore the high 32 bits here
86 static int handle_mov_to_cr0(struct guest_info * info, struct x86_instr * dec_instr) {
88 struct cr0_32 * shadow_cr0 = (struct cr0_32 *)&(info->ctrl_regs.cr0);
89 struct cr0_32 * new_cr0 = (struct cr0_32 *)(dec_instr->src_operand.operand);
90 struct cr0_32 * guest_cr0 = (struct cr0_32 *)&(info->shdw_pg_state.guest_cr0);
91 uint_t paging_transition = 0;
93 PrintDebug("MOV2CR0 (MODE=%s)\n", v3_cpu_mode_to_str(info->cpu_mode));
95 PrintDebug("OperandVal = %x, length=%d\n", *(uint_t *)new_cr0, dec_instr->src_operand.size);
97 PrintDebug("Old CR0=%x\n", *(uint_t *)shadow_cr0);
98 PrintDebug("Old Guest CR0=%x\n", *(uint_t *)guest_cr0);
101 // We detect if this is a paging transition
102 if (guest_cr0->pg != new_cr0->pg) {
103 paging_transition = 1;
106 // Guest always sees the value they wrote
107 *guest_cr0 = *new_cr0;
109 // This value must always be set to 1
112 // Set the shadow register to catch non-virtualized flags
113 *shadow_cr0 = *guest_cr0;
115 // Paging is always enabled
118 // Was there a paging transition
119 // Meaning we need to change the page tables
120 if (paging_transition) {
121 if (v3_get_mem_mode(info) == VIRTUAL_MEM) {
123 struct efer_64 * guest_efer = (struct efer_64 *)&(info->guest_efer);
124 struct efer_64 * shadow_efer = (struct efer_64 *)&(info->ctrl_regs.efer);
126 // Check long mode LME to set LME
127 if (guest_efer->lme == 1) {
128 PrintDebug("Enabing Long Mode\n");
131 shadow_efer->lma = 1;
132 shadow_efer->lme = 1;
134 PrintDebug("New EFER %p\n", (void *)*(addr_t *)(shadow_efer));
137 PrintDebug("Activating Shadow Page Tables\n");
139 if (v3_activate_shadow_pt(info) == -1) {
140 PrintError("Failed to activate shadow page tables\n");
145 if (v3_activate_passthrough_pt(info) == -1) {
146 PrintError("Failed to activate passthrough page tables\n");
153 PrintDebug("New Guest CR0=%x\n",*(uint_t *)guest_cr0);
154 PrintDebug("New CR0=%x\n", *(uint_t *)shadow_cr0);
// Emulates CLTS: clears the Task Switched (TS) bit in CR0.
// NOTE(review): this listing is elided -- the actual `ts = 0` updates to
// the real and guest CR0, the return statement, and closing braces are
// not visible here.
static int handle_clts(struct guest_info * info, struct x86_instr * dec_instr) {
    struct cr0_32 * real_cr0 = (struct cr0_32*)&(info->ctrl_regs.cr0);

    // Under shadow paging the guest has its own virtualized copy of CR0,
    // which must be updated as well so the guest sees TS cleared.
    if (info->shdw_pg_mode == SHADOW_PAGING) {
	struct cr0_32 * guest_cr0 = (struct cr0_32 *)&(info->shdw_pg_state.guest_cr0);
// Emulates LMSW: loads the low 4 bits (PE, MP, EM, TS) of the machine
// status word into CR0, leaving all other CR0 bits untouched.  These four
// bits are mapped straight through to the real CR0; under shadow paging
// the guest's virtualized CR0 copy is updated identically.
// NOTE(review): elided listing -- the declaration of `new_cr0_val`, the
// return statement, and closing braces are not visible here.
static int handle_lmsw(struct guest_info * info, struct x86_instr * dec_instr) {
    struct cr0_real * real_cr0 = (struct cr0_real *)&(info->ctrl_regs.cr0);
    // XED is a mess, and basically reverses the operand order for an LMSW
    struct cr0_real * new_cr0 = (struct cr0_real *)(dec_instr->dst_operand.operand);

    PrintDebug("LMSW\n");

    // LMSW only affects the low nibble of CR0; mask off everything else.
    new_cr0_val = (*(char*)(new_cr0)) & 0x0f;

    PrintDebug("OperandVal = %x\n", new_cr0_val);

    // We can just copy the new value through
    // we don't need to virtualize the lower 4 bits
    PrintDebug("Old CR0=%x\n", *(uint_t *)real_cr0);
    // Clear the low nibble of the real CR0, then merge in the new bits.
    *(uchar_t*)real_cr0 &= 0xf0;
    *(uchar_t*)real_cr0 |= new_cr0_val;
    PrintDebug("New CR0=%x\n", *(uint_t *)real_cr0);

    // If Shadow paging is enabled we push the changes to the virtualized copy of cr0
    if (info->shdw_pg_mode == SHADOW_PAGING) {
	struct cr0_real * guest_cr0 = (struct cr0_real*)&(info->shdw_pg_state.guest_cr0);

	PrintDebug("Old Guest CR0=%x\n", *(uint_t *)guest_cr0);
	// Same low-nibble merge as above, applied to the guest-visible copy.
	*(uchar_t*)guest_cr0 &= 0xf0;
	*(uchar_t*)guest_cr0 |= new_cr0_val;
	PrintDebug("New Guest CR0=%x\n", *(uint_t *)guest_cr0);
// First attempt = 253 lines
// current = 51 lines
//
// Exit handler for guest reads of CR0 (MOV-from-CR0 and SMSW).  Under
// shadow paging the guest is handed its virtualized CR0 copy; otherwise
// it gets the shadow (real) CR0.  SMSW only exposes the low 4 bits.
// NOTE(review): elided listing -- declarations of `ret`/`instr`,
// else-branches, returns, and closing braces are not visible here.
int v3_handle_cr0_read(struct guest_info * info) {
    struct x86_instr dec_instr;

    // Fetch up to 15 bytes at the guest RIP (physical vs. virtual read
    // depending on whether guest paging is enabled; else-arm elided).
    if (info->mem_mode == PHYSICAL_MEM) {
	ret = read_guest_pa_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);
	ret = read_guest_va_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);

    if (v3_decode(info, (addr_t)instr, &dec_instr) == -1) {
	PrintError("Could not decode instruction\n");

    if (dec_instr.op_type == V3_OP_MOVCR2) {
	struct cr0_32 * dst_reg = (struct cr0_32 *)(dec_instr.dst_operand.operand);
	struct cr0_32 * shadow_cr0 = (struct cr0_32 *)&(info->ctrl_regs.cr0);

	PrintDebug("MOVCR2 (mode=%s)\n", v3_cpu_mode_to_str(info->cpu_mode));

	// Shadow paging: return the guest's virtualized CR0, not the real one.
	if (info->shdw_pg_mode == SHADOW_PAGING) {
	    struct cr0_32 * guest_cr0 = (struct cr0_32 *)&(info->shdw_pg_state.guest_cr0);
	    *dst_reg = *guest_cr0;
	    *dst_reg = *shadow_cr0;

	PrintDebug("Shadow CR0: %x\n", *(uint_t*)shadow_cr0);
	PrintDebug("returned CR0: %x\n", *(uint_t*)dst_reg);
    } else if (dec_instr.op_type == V3_OP_SMSW) {
	struct cr0_real * shadow_cr0 = (struct cr0_real *)&(info->ctrl_regs.cr0);
	struct cr0_real * dst_reg = (struct cr0_real *)(dec_instr.dst_operand.operand);
	char cr0_val = *(char*)shadow_cr0 & 0x0f;

	PrintDebug("SMSW\n");

	// The lower 4 bits of the guest/shadow CR0 are mapped through
	// We can treat nested and shadow paging the same here
	*(char *)dst_reg &= 0xf0;
	*(char *)dst_reg |= cr0_val;
	PrintError("Unhandled opcode in handle_cr0_read\n");

    // Advance the guest past the emulated instruction.
    info->rip += dec_instr.instr_length;
// First Attempt = 256 lines
// current = 65 lines
//
// Exit handler for guest writes to CR3 (MOV-to-CR3 only).  Under shadow
// paging the new value is stored in the guest's virtualized CR3 and, if
// guest paging is active, the shadow page tables are rebuilt.  Under
// nested paging the value is written straight through to the real CR3.
// NOTE(review): elided listing -- `ret`/`instr` declarations,
// else-branches, returns, and closing braces are not visible here.
int v3_handle_cr3_write(struct guest_info * info) {
    struct x86_instr dec_instr;

    // Fetch up to 15 bytes at the guest RIP (physical vs. virtual read;
    // else-arm elided from this view).
    if (info->mem_mode == PHYSICAL_MEM) {
	ret = read_guest_pa_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);
	ret = read_guest_va_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);

    if (v3_decode(info, (addr_t)instr, &dec_instr) == -1) {
	PrintError("Could not decode instruction\n");

    if (dec_instr.op_type == V3_OP_MOV2CR) {
	PrintDebug("MOV2CR3 (cpu_mode=%s)\n", v3_cpu_mode_to_str(info->cpu_mode));

	if (info->shdw_pg_mode == SHADOW_PAGING) {
	    PrintDebug("Old Shadow CR3=%p; Old Guest CR3=%p\n",
		       (void *)(addr_t)(info->ctrl_regs.cr3),
		       (void*)(addr_t)(info->shdw_pg_state.guest_cr3));

	    // We update the guest CR3
	    // 64-bit vs. 32-bit CR3 layout depending on CPU mode.
	    if (info->cpu_mode == LONG) {
		struct cr3_64 * new_cr3 = (struct cr3_64 *)(dec_instr.src_operand.operand);
		struct cr3_64 * guest_cr3 = (struct cr3_64 *)&(info->shdw_pg_state.guest_cr3);
		*guest_cr3 = *new_cr3;
		struct cr3_32 * new_cr3 = (struct cr3_32 *)(dec_instr.src_operand.operand);
		struct cr3_32 * guest_cr3 = (struct cr3_32 *)&(info->shdw_pg_state.guest_cr3);
		*guest_cr3 = *new_cr3;

	    // If Paging is enabled in the guest then we need to change the shadow page tables
	    if (info->mem_mode == VIRTUAL_MEM) {
		if (v3_activate_shadow_pt(info) == -1) {
		    PrintError("Failed to activate 32 bit shadow page table\n");

	    PrintDebug("New Shadow CR3=%p; New Guest CR3=%p\n",
		       (void *)(addr_t)(info->ctrl_regs.cr3),
		       (void*)(addr_t)(info->shdw_pg_state.guest_cr3));

	} else if (info->shdw_pg_mode == NESTED_PAGING) {

	    // This is just a passthrough operation which we probably don't need here
	    if (info->cpu_mode == LONG) {
		struct cr3_64 * new_cr3 = (struct cr3_64 *)(dec_instr.src_operand.operand);
		struct cr3_64 * guest_cr3 = (struct cr3_64 *)&(info->ctrl_regs.cr3);
		*guest_cr3 = *new_cr3;
		struct cr3_32 * new_cr3 = (struct cr3_32 *)(dec_instr.src_operand.operand);
		struct cr3_32 * guest_cr3 = (struct cr3_32 *)&(info->ctrl_regs.cr3);
		*guest_cr3 = *new_cr3;
	PrintError("Unhandled opcode in handle_cr3_write\n");

    // Advance the guest past the emulated instruction.
    info->rip += dec_instr.instr_length;
// first attempt = 156 lines
// current = 36 lines
//
// Exit handler for guest reads of CR3 (MOV-from-CR3).  Under shadow
// paging the guest is handed its virtualized CR3; under nested paging the
// real CR3 is passed through.  64-bit vs. 32-bit copies are selected by
// the current CPU mode.
// NOTE(review): elided listing -- `ret`/`instr` declarations,
// else-branches, returns, and closing braces are not visible here.
int v3_handle_cr3_read(struct guest_info * info) {
    struct x86_instr dec_instr;

    // Fetch up to 15 bytes at the guest RIP (physical vs. virtual read;
    // else-arm elided from this view).
    if (info->mem_mode == PHYSICAL_MEM) {
	ret = read_guest_pa_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);
	ret = read_guest_va_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);

    if (v3_decode(info, (addr_t)instr, &dec_instr) == -1) {
	PrintError("Could not decode instruction\n");

    if (dec_instr.op_type == V3_OP_MOVCR2) {
	PrintDebug("MOVCR32 (mode=%s)\n", v3_cpu_mode_to_str(info->cpu_mode));

	if (info->shdw_pg_mode == SHADOW_PAGING) {

	    // Return the guest's virtualized CR3, using the width that
	    // matches the current CPU mode.
	    if ((v3_get_cpu_mode(info) == LONG) ||
		(v3_get_cpu_mode(info) == LONG_32_COMPAT)) {
		struct cr3_64 * dst_reg = (struct cr3_64 *)(dec_instr.dst_operand.operand);
		struct cr3_64 * guest_cr3 = (struct cr3_64 *)&(info->shdw_pg_state.guest_cr3);
		*dst_reg = *guest_cr3;
		struct cr3_32 * dst_reg = (struct cr3_32 *)(dec_instr.dst_operand.operand);
		struct cr3_32 * guest_cr3 = (struct cr3_32 *)&(info->shdw_pg_state.guest_cr3);
		*dst_reg = *guest_cr3;

	} else if (info->shdw_pg_mode == NESTED_PAGING) {

	    // This is just a passthrough operation which we probably don't need here
	    if ((v3_get_cpu_mode(info) == LONG) ||
		(v3_get_cpu_mode(info) == LONG_32_COMPAT)) {
		struct cr3_64 * dst_reg = (struct cr3_64 *)(dec_instr.dst_operand.operand);
		struct cr3_64 * guest_cr3 = (struct cr3_64 *)&(info->ctrl_regs.cr3);
		*dst_reg = *guest_cr3;
		struct cr3_32 * dst_reg = (struct cr3_32 *)(dec_instr.dst_operand.operand);
		struct cr3_32 * guest_cr3 = (struct cr3_32 *)&(info->ctrl_regs.cr3);
		*dst_reg = *guest_cr3;
	PrintError("Unhandled opcode in handle_cr3_read\n");

    // Advance the guest past the emulated instruction.
    info->rip += dec_instr.instr_length;
// We don't need to virtualize CR4, all we need is to detect the activation of PAE
// CR4 reads are effectively a no-op for virtualization purposes.
// NOTE(review): the function body (return statement, closing brace) is
// elided from this view.
int v3_handle_cr4_read(struct guest_info * info) {
    //  PrintError("CR4 Read not handled\n");
// Exit handler for guest writes to CR4 (MOV-to-CR4 only).
// Detects TLB-flush side effects (PSE/PGE/PAE changes while paging is on)
// so the shadow page tables can be rebuilt, and handles PAE on/off
// transitions by swapping the passthrough page-table format.  In long
// mode, clearing PAE is rejected (should inject #GP into the guest).
// NOTE(review): elided listing -- `ret`/`instr` and the flush flag
// declarations, several else/closing braces, the actual CR4 update, and
// returns are not visible here.
int v3_handle_cr4_write(struct guest_info * info) {
    struct x86_instr dec_instr;
    v3_vm_cpu_mode_t cpu_mode = v3_get_cpu_mode(info);

    // Fetch up to 15 bytes at the guest RIP (physical vs. virtual read;
    // else-arm elided from this view).
    if (info->mem_mode == PHYSICAL_MEM) {
	ret = read_guest_pa_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);
	ret = read_guest_va_memory(info, get_addr_linear(info, info->rip, &(info->segments.cs)), 15, instr);

    if (v3_decode(info, (addr_t)instr, &dec_instr) == -1) {
	PrintError("Could not decode instruction\n");

    // Only MOV-to-CR4 is expected here.
    if (dec_instr.op_type != V3_OP_MOV2CR) {
	PrintError("Invalid opcode in write to CR4\n");

    // Check to see if we need to flush the tlb

    if (v3_get_mem_mode(info) == VIRTUAL_MEM) {
	struct cr4_32 * new_cr4 = (struct cr4_32 *)(dec_instr.src_operand.operand);
	struct cr4_32 * cr4 = (struct cr4_32 *)&(info->ctrl_regs.cr4);

	// if pse, pge, or pae have changed while PG (in any mode) is on
	// the side effect is a TLB flush, which means we need to
	// toss the current shadow page tables too

	// TODO - PAE FLAG needs to be special cased
	if ((cr4->pse != new_cr4->pse) ||
	    (cr4->pge != new_cr4->pge) ||
	    (cr4->pae != new_cr4->pae)) {
	    PrintDebug("Handling PSE/PGE/PAE -> TLBFlush case, flag set\n");

    // 32-bit protected mode (with or without PAE): handle PAE transitions.
    if ((cpu_mode == PROTECTED) || (cpu_mode == PROTECTED_PAE)) {
	struct cr4_32 * new_cr4 = (struct cr4_32 *)(dec_instr.src_operand.operand);
	struct cr4_32 * cr4 = (struct cr4_32 *)&(info->ctrl_regs.cr4);

	PrintDebug("OperandVal = %x, length = %d\n", *(uint_t *)new_cr4, dec_instr.src_operand.size);
	PrintDebug("Old CR4=%x\n", *(uint_t *)cr4);

	if ((info->shdw_pg_mode == SHADOW_PAGING)) {
	    if (v3_get_mem_mode(info) == PHYSICAL_MEM) {

		// PAE being turned on: the direct-map tables must be
		// rebuilt in 32-bit PAE format.
		if ((cr4->pae == 0) && (new_cr4->pae == 1)) {
		    PrintDebug("Creating PAE passthrough tables\n");

		    // Delete the old 32 bit direct map page tables
		    delete_page_tables_32((pde32_t *)V3_VAddr((void *)(info->direct_map_pt)));

		    // create 32 bit PAE direct map page table
		    info->direct_map_pt = (addr_t)V3_PAddr(create_passthrough_pts_32PAE(info));

		    // reset cr3 to new page tables
		    info->ctrl_regs.cr3 = *(addr_t*)&(info->direct_map_pt);

		} else if ((cr4->pae == 1) && (new_cr4->pae == 0)) {
		    // Create passthrough standard 32bit pagetables

	PrintDebug("New CR4=%x\n", *(uint_t *)cr4);

    } else if ((cpu_mode == LONG) || (cpu_mode == LONG_32_COMPAT)) {
	struct cr4_64 * new_cr4 = (struct cr4_64 *)(dec_instr.src_operand.operand);
	struct cr4_64 * cr4 = (struct cr4_64 *)&(info->ctrl_regs.cr4);

	PrintDebug("Old CR4=%p\n", (void *)*(addr_t *)cr4);
	PrintDebug("New CR4=%p\n", (void *)*(addr_t *)new_cr4);

	if (new_cr4->pae == 0) {
	    // cannot turn off PAE in long mode GPF the guest
	    PrintError("Cannot disable PAE in long mode, sending GPF\n");
	PrintError("CR4 write not supported in CPU_MODE: %s\n", v3_cpu_mode_to_str(cpu_mode));

	// Deferred TLB-flush handling: rebuild the shadow page tables now
	// that the new CR4 value has been accepted.
	PrintDebug("Handling PSE/PGE/PAE -> TLBFlush (doing flush now!)\n");
	if (v3_activate_shadow_pt(info) == -1) {
	    PrintError("Failed to activate shadow page tables when emulating TLB flush in handling cr4 write\n");

    // Advance the guest past the emulated instruction.
    info->rip += dec_instr.instr_length;
// MSR read handler for EFER.  Returns the guest's virtualized EFER value
// (which hides the SVME/LMA bits -- see v3_handle_efer_write) and skips
// the guest past the RDMSR instruction.
// NOTE(review): the return statement/closing brace are elided from view.
int v3_handle_efer_read(uint_t msr, struct v3_msr * dst, void * priv_data) {
    struct guest_info * info = (struct guest_info *)(priv_data);
    PrintDebug("EFER Read HI=%x LO=%x\n", info->guest_efer.hi, info->guest_efer.lo);

    dst->value = info->guest_efer.value;

    info->rip += 2; // WRMSR/RDMSR are two byte operands
// TODO: this is a disaster we need to clean this up...
//
// MSR write handler for EFER.  The guest's EFER is virtualized: the guest
// sees exactly the value it wrote, while the shadow (real) EFER keeps SVME
// forced on and LMA managed by the VMM.  A shadow-paging guest toggling
// LME while paging is off triggers rebuilding of the direct-map page
// tables in 64-bit format (long mode entry).
// NOTE(review): elided listing -- `new_efer` is used below but its
// declaration only appears as the commented-out line at the top of the
// function; the live declaration is presumably among the elided lines.
// Else-branches, returns, and closing braces are also not visible.
int v3_handle_efer_write(uint_t msr, struct v3_msr src, void * priv_data) {
    struct guest_info * info = (struct guest_info *)(priv_data);
    //struct efer_64 * new_efer = (struct efer_64 *)&(src.value);
    struct efer_64 * shadow_efer = (struct efer_64 *)&(info->ctrl_regs.efer);
    struct v3_msr * guest_efer = &(info->guest_efer);

    PrintDebug("EFER Write\n");
    PrintDebug("EFER Write Values: HI=%x LO=%x\n", src.hi, src.lo);
    //PrintDebug("Old EFER=%p\n", (void *)*(addr_t*)(shadow_efer));

    // We virtualize the guests efer to hide the SVME and LMA bits
    guest_efer->value = src.value;

    // Enable/Disable Syscall
    // SCE is bit 0 of EFER; pass the guest's setting through.
    shadow_efer->sce = src.value & 0x1;

    // We have to handle long mode writes....

    if ((info->shdw_pg_mode == SHADOW_PAGING) &&
	(v3_get_mem_mode(info) == PHYSICAL_MEM)) {

	// LME 0 -> 1 while paging is off: prepare for long-mode entry by
	// replacing the direct-map tables with 64-bit passthrough tables.
	if ((shadow_efer->lme == 0) && (new_efer->lme == 1)) {
	    PrintDebug("Transition to longmode\n");
	    PrintDebug("Creating Passthrough 64 bit page tables\n");

	    // Delete the old 32 bit direct map page tables
	    PrintDebug("Deleting old PAE Page tables\n");
	    PrintError("JRL BUG?: Will the old page tables always be in PAE format??\n");
	    delete_page_tables_32PAE((pdpe32pae_t *)V3_VAddr((void *)(info->direct_map_pt)));

	    // create 64 bit direct map page table
	    info->direct_map_pt = (addr_t)V3_PAddr(create_passthrough_pts_64(info));

	    // reset cr3 to new page tables
	    info->ctrl_regs.cr3 = *(addr_t*)&(info->direct_map_pt);

	    // We mark the Long Mode active because we have paging enabled
	    // We do this in new_efer because we copy the msr in full below
	    //	    new_efer->lma = 1;

	} else if ((shadow_efer->lme == 1) && (new_efer->lme == 0)) {
	    // transition out of long mode
	    //((struct efer_64 *)&(info->guest_efer.value))->lme = 0;
	    //((struct efer_64 *)&(info->guest_efer.value))->lma = 0;

	// accept all changes to the efer, but make sure that the SVME bit is set... (SVM specific)
	// SVME must stay set or the host loses SVM intercepts.
	*shadow_efer = *new_efer;
	shadow_efer->svme = 1;

	PrintDebug("New EFER=%p\n", (void *)*(addr_t *)(shadow_efer));
	PrintError("Write to EFER in NESTED_PAGING or VIRTUAL_MEM mode not supported\n");
	// Should probably just check for a long mode transition, and bomb out if it is

    info->rip += 2; // WRMSR/RDMSR are two byte operands