static int palacios_file_mkdir(const char * pathname, unsigned short perms, int recurse) {
/* Welcome to the jungle... */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,41)
/* DO NOT REFERENCE THIS VARIABLE */
/* It only exists to provide version compatibility */
struct path tmp_path;
}
/* Before Linux 3.1 this was somewhat more difficult */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,41)
{
struct nameidata nd;
// I'm not 100% sure about the version here, but it was around this time that the API changed
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,35)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
ret = kern_path_parent(pathname, &nd);
#else
return 0;
}
-
static int host_dev_ioctl(struct inode *ip, struct file *fp, unsigned int val, unsigned long arg)
{
void __user *argp = (void __user *)arg;
}
-
-
+/* Compatibility shim used when the kernel provides .compat_ioctl instead of
+ * the legacy .ioctl file operation. Forwards to the two-argument handler;
+ * the inode argument is unused there, so NULL is passed. */
+static long host_dev_compat_ioctl(struct file * filp, unsigned int ioctl, unsigned long arg)
+{
+ return host_dev_ioctl(NULL, filp, ioctl, arg);
+}
static struct file_operations host_dev_fops = {
.poll = host_dev_poll,
.release = host_dev_release,
- .ioctl = host_dev_ioctl,
+#ifdef HAVE_COMPAT_IOCTL
+ .compat_ioctl = host_dev_compat_ioctl,
+#else
+ .ioctl = host_dev_ioctl,
+#endif
};
static void inline
deinit_raw_interface(struct raw_interface * iface){
- struct v3_packet * recver_state;
+ struct v3_packet * recver_state, * tmp_state;
kthread_stop(iface->recv_thread);
sock_release(iface->raw_sock);
palacios_free_htable(iface->mac_to_recver, 0, 0);
- list_for_each_entry(recver_state, &(iface->brdcast_recvers), node) {
+ list_for_each_entry_safe(recver_state, tmp_state, &(iface->brdcast_recvers), node) {
kfree(recver_state);
}
}
}
static int packet_deinit( void ) {
- struct raw_interface * iface;
+ struct raw_interface * iface, * tmp;
- list_for_each_entry(iface, &(packet_state.open_interfaces), node) {
+ list_for_each_entry_safe(iface, tmp, &(packet_state.open_interfaces), node) {
deinit_raw_interface(iface);
kfree(iface);
}
}
}
+ /* printk("PALACIOS BAD: LARGE PAGE ALLOCATION FAILED\n"); */
+
return 0;
}
struct page * pgs = NULL;
int order = get_order(num_pages * PAGE_SIZE);
- pgs = alloc_pages(GFP_DMA, order);
+ pgs = alloc_pages(GFP_DMA32, order);
WARN(!pgs, "Could not allocate pages\n");
+
+ /* if (!pgs) { printk("PALACIOS BAD: SMALL PAGE ALLOCATION FAILED\n"); } */
/* printk("%llu pages (order=%d) aquired from alloc_pages\n",
num_pages, order); */
}
static void deinit_links_list(void){
- struct vnet_link * link;
+ struct vnet_link * link, * tmp_link;
- list_for_each_entry(link, &(vnet_brg_s.link_list), node) {
+ list_for_each_entry_safe(link, tmp_link, &(vnet_brg_s.link_list), node) {
_delete_link(link);
}
}
static void deinit_links_list(void){
- struct vnet_link_iter * link;
+ struct vnet_link_iter * link, * tmp_link;
- list_for_each_entry(link, &(vnet_ctrl_s.link_iter_list), node) {
+ list_for_each_entry_safe(link, tmp_link, &(vnet_ctrl_s.link_iter_list), node) {
delete_link(link);
}
}
static void deinit_routes_list(void){
- struct vnet_route_iter * route;
+ struct vnet_route_iter * route, * tmp_route;
- list_for_each_entry(route, &(vnet_ctrl_s.route_list), node) {
+ list_for_each_entry_safe(route, tmp_route, &(vnet_ctrl_s.route_list), node) {
delete_route(route);
}
}
v3_mem : v3_mem.c v3_ctrl.h
gcc -static v3_mem.c -o v3_mem
-v3_cons : v3_cons.c v3_cons_sc.c v3_ctrl.h
- gcc -static v3_cons.c -o v3_cons -lcurses
- gcc -static v3_cons_sc.c -o v3_cons_sc -lcurses
+v3_cons_s: v3_cons.c v3_cons_sc.c v3_ctrl.h
+ -gcc -static -DNCURSES_STATIC v3_cons.c -o v3_cons_s -lcurses # -ltinfo needed on fedora
+ -gcc -static -DNCURSES_STATIC v3_cons_sc.c -o v3_cons_sc_s -lcurses # -ltinfo needed on fedora
+
+v3_cons: v3_cons.c v3_cons_sc.c v3_ctrl.h
+ -gcc v3_cons.c -o v3_cons_s -lcurses
+ -gcc v3_cons_sc.c -o v3_cons_sc_s -lcurses
v3_stream : v3_stream.c v3_ctrl.h
gcc -static v3_stream.c -o v3_stream
void v3_telemetry_start_exit(struct guest_info * info);
void v3_telemetry_end_exit(struct guest_info * info, uint_t exit_code);
+void v3_print_core_telemetry(struct guest_info * core);
void v3_print_global_telemetry(struct v3_vm_info * vm);
-void v3_print_core_telemetry(struct guest_info * vm);
+void v3_print_telemetry(struct v3_vm_info * vm, struct guest_info * core);
void v3_add_telemetry_cb(struct v3_vm_info * vm,
// Installed Timers slaved off of the guest monotonic TSC
uint_t num_timers;
struct list_head timers;
+
+ // Installed timeout handlers, and the time (in monotonic guest time) of the
+ // next timeout.
+ uint64_t next_timeout;
+ struct list_head timeout_hooks;
};
struct v3_timer_ops {
void * private_data;
struct v3_timer_ops * ops;
+ // Need to add accuracy/resolution fields later.
+
struct list_head timer_link;
};
+/* Callback invoked when a scheduled guest-time timeout expires. */
+typedef void (*v3_timeout_callback_t)(struct guest_info * info, void * priv_data);
+/* A registered one-shot timeout hook. All hooks on a core's timeout_hooks
+ * list are invoked whenever any scheduled timeout fires; see
+ * v3_check_timeout(). */
+struct v3_timeout_hook {
+ void * private_data;            /* opaque data handed back to callback */
+ v3_timeout_callback_t callback; /* function run on timeout expiry */
+
+ struct list_head hook_link;     /* linkage on vm_time.timeout_hooks */
+};
+
// Basic functions for handling passage of time in palacios
void v3_init_time_core(struct guest_info * core);
int v3_init_time_vm(struct v3_vm_info * vm);
int v3_adjust_time(struct guest_info * core);
int v3_offset_time(struct guest_info * core, sint64_t offset);
-// Basic functions for attaching timers to the passage of time
+// Basic functions for attaching timers to the passage of time - these timers
+// should eventually specify their accuracy and resolution.
struct v3_timer * v3_add_timer(struct guest_info * info, struct v3_timer_ops * ops, void * private_data);
int v3_remove_timer(struct guest_info * info, struct v3_timer * timer);
void v3_update_timers(struct guest_info * info);
+// Functions for handling one-shot timeouts in Palacios. Note that only one
+// timeout is ever currently outstanding (the soonest scheduled one!), and that
+// all hooks are called on any timeout. If a hook gets called before the desired
+// timeout time, that hook should reschedule its own timeout if desired.
+struct v3_timeout_hook * v3_add_timeout_hook(struct guest_info * info, v3_timeout_callback_t callback, void * priv_data);
+int v3_remove_timeout_hook(struct guest_info * info, struct v3_timeout_hook * hook);
+int v3_schedule_timeout(struct guest_info * info, ullong_t cycles);
+int v3_check_timeout(struct guest_info * info);
+
// Functions to return the different notions of time in Palacios.
static inline uint64_t v3_get_host_time(struct vm_time *t) {
uint64_t tmp;
};
+/* Layout of the page used as the VMCS MSR load/store area.
+ * guest_msrs serves as both the VM-exit store and VM-entry load array,
+ * and host_msrs as the VM-exit load array (see the VMCS_*_MSR_*_ADDR
+ * writes elsewhere in this patch). The anonymous unions let each slot be
+ * addressed either by index or by MSR name (STAR/LSTAR/FMASK/KERNEL_GS).
+ * Kept packed to match the hardware-defined vmcs_msr_entry layout. */
+struct vmcs_msr_save_area {
+ union {
+ struct vmcs_msr_entry guest_msrs[4];
+ struct {
+ struct vmcs_msr_entry guest_star;
+ struct vmcs_msr_entry guest_lstar;
+ struct vmcs_msr_entry guest_fmask;
+ struct vmcs_msr_entry guest_kern_gs;
+ } __attribute__((packed));
+ } __attribute__((packed));
+
+ union {
+ struct vmcs_msr_entry host_msrs[4];
+ struct {
+ struct vmcs_msr_entry host_star;
+ struct vmcs_msr_entry host_lstar;
+ struct vmcs_msr_entry host_fmask;
+ struct vmcs_msr_entry host_kern_gs;
+ } __attribute__((packed));
+ } __attribute__((packed));
+
+} __attribute__((packed));
+
struct vmx_data {
vmx_state_t state;
struct vmx_exception_bitmap excp_bmap;
- void * msr_area;
+ addr_t msr_area_paddr;
+ struct vmcs_msr_save_area * msr_area;
};
int v3_is_vmx_capable();
}
bars[0].type = PCI_BAR_IO;
- bars[0].default_base_port = NIC_REG_BASE_PORT;
+ bars[0].default_base_port = -1;
bars[0].num_ports = 256;
bars[0].io_read = ne2k_pci_read;
#define BUF_SIZE 1024
#define DEBUG_PORT1 0xc0c0
+#define HEARTBEAT_PORT 0x99
struct debug_state {
char debug_buf[BUF_SIZE];
return length;
}
+/* I/O write handler for the guest OS heartbeat port (HEARTBEAT_PORT).
+ * Reads a 1-, 2-, or 4-byte value from the guest buffer (any other length
+ * is treated as 32-bit), logs it, and reports the full length consumed. */
+static int handle_hb_write(struct guest_info * core, ushort_t port, void * src, uint_t length, void * priv_data) {
+ uint32_t val = 0;
+
+ if (length == 1) {
+ val = *(uint8_t *)src;
+ } else if (length == 2) {
+ val = *(uint16_t *)src;
+ } else {
+ val = *(uint32_t *)src;
+ }
+
+ V3_Print("HEARTBEAT> %x (%d)\n", val, val);
+
+ return length;
+}
+
static int handle_hcall(struct guest_info * info, uint_t hcall_id, void * priv_data) {
struct debug_state * state = (struct debug_state *)priv_data;
return -1;
}
+
+ if (v3_dev_hook_io(dev, HEARTBEAT_PORT, NULL, &handle_hb_write) == -1) {
+ PrintError("error hooking OS heartbeat port\n");
+ v3_remove_device(dev);
+ return -1;
+ }
+
v3_register_hypercall(vm, OS_DEBUG_HCALL, handle_hcall, state);
state->debug_offset = 0;
}
bars[0].type = PCI_BAR_IO;
- bars[0].default_base_port = 0xc100;
+ bars[0].default_base_port = -1;
bars[0].num_ports = 0x100;
bars[0].io_read = rtl8139_ioport_read;
// Perform any additional yielding needed for time adjustment
v3_adjust_time(info);
+ // Check for timeout - since this calls generic hooks in devices
+ // that may do things like pause the VM, it cannot be with interrupts
+ // disabled.
+ v3_check_timeout(info);
+
// disable global interrupts for vm state transition
v3_clgi();
PrintDebug("IN of %d bytes on port %d (0x%x)\n", read_size, io_info->port, io_info->port);
if (hook == NULL) {
- PrintDebug("IN operation on unhooked IO port 0x%x\n", io_info->port);
+ PrintDebug("IN operation on unhooked IO port 0x%x - returning zero\n", io_info->port);
+ core->vm_regs.rax >>= 8*read_size;
+ core->vm_regs.rax <<= 8*read_size;
- /* What are the HW semantics for an IN on an invalid port?
- * Do we need to clear the register value or leave it untouched???
- */
} else {
if (hook->read(core, io_info->port, &(core->vm_regs.rax), read_size, hook->priv_data) != read_size) {
// not sure how we handle errors.....
}
if (hook == NULL) {
- PrintDebug("INS operation on unhooked IO port 0x%x\n", io_info->port);
- /* What are the HW semantics for an INS on an invalid port?
- * Do we need to clear the memory region or leave it untouched???
- */
+ PrintDebug("INS operation on unhooked IO port 0x%x - returning zeros\n", io_info->port);
+ memset((char*)host_addr,0,read_size);
+
} else {
if (hook->read(core, io_info->port, (char *)host_addr, read_size, hook->priv_data) != read_size) {
// not sure how we handle errors.....
PrintDebug("OUT of %d bytes on port %d (0x%x)\n", write_size, io_info->port, io_info->port);
if (hook == NULL) {
- PrintDebug("OUT operation on unhooked IO port 0x%x\n", io_info->port);
+ PrintDebug("OUT operation on unhooked IO port 0x%x - ignored\n", io_info->port);
} else {
if (hook->write(core, io_info->port, &(core->vm_regs.rax), write_size, hook->priv_data) != write_size) {
// not sure how we handle errors.....
}
if (hook == NULL) {
- PrintDebug("OUTS operation on unhooked IO port 0x%x\n", io_info->port);
+ PrintDebug("OUTS operation on unhooked IO port 0x%x - ignored\n", io_info->port);
} else {
if (hook->write(core, io_info->port, (char*)host_addr, write_size, hook->priv_data) != write_size) {
// not sure how we handle errors.....
// save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
+ {
+ struct vmx_data * vmx_state = (struct vmx_data *)info->vmm_data;
+ struct vmcs_msr_save_area * msr_entries = vmx_state->msr_area;
+ v3_get_msr(IA32_STAR_MSR, &(msr_entries->host_star.hi), &(msr_entries->host_star.lo));
+ v3_get_msr(IA32_LSTAR_MSR, &(msr_entries->host_lstar.hi), &(msr_entries->host_lstar.lo));
+ v3_get_msr(IA32_FMASK_MSR, &(msr_entries->host_fmask.hi), &(msr_entries->host_fmask.lo));
+ v3_get_msr(IA32_KERN_GS_BASE_MSR, &(msr_entries->host_kern_gs.hi), &(msr_entries->host_kern_gs.lo));
+ }
PrintDebug("run: core=%u, func=0x%p, arg=0x%p, name=%s\n",
core_idx, start_core, core, core->exec_name);
+ core->core_run_state = CORE_STOPPED; // core zero will turn itself on
core->pcpu_id = core_idx;
core->core_thread = V3_CREATE_THREAD_ON_CPU(core_idx, start_core, core, core->exec_name);
map->base_region.host_addr = (addr_t)V3_AllocPages(mem_pages);
#endif
+ // Clear the memory...
+ memset(V3_VAddr((void *)map->base_region.host_addr), 0, mem_pages * PAGE_SIZE_4KB);
+
+
map->base_region.flags.read = 1;
map->base_region.flags.write = 1;
map->base_region.flags.exec = 1;
int v3_msr_unhandled_read(struct guest_info * core, uint32_t msr, struct v3_msr * dst, void * priv_data) {
- V3_Print("Palacios: Unhandled MSR Read (MSR=0x%x)\n", msr);
+ V3_Print("Palacios: Unhandled MSR Read (MSR=0x%x) - returning zero\n", msr);
+ dst->lo=dst->hi=0;
+ // should produce GPF for unsupported msr
return 0;
}
int v3_msr_unhandled_write(struct guest_info * core, uint32_t msr, struct v3_msr src, void * priv_data) {
- V3_Print("Palacios: Unhandled MSR Write (MSR=0x%x)\n", msr);
+ V3_Print("Palacios: Unhandled MSR Write (MSR=0x%x) - ignored\n", msr);
+ // should produce GPF for unsupported msr
return 0;
}
// check if the exit count has expired
if ((telemetry->exit_cnt % telemetry->vm_telem->granularity) == 0) {
- v3_print_global_telemetry(info->vm_info);
- v3_print_core_telemetry(info);
+ v3_print_telemetry(info->vm_info, info);
}
}
}
-void v3_print_core_telemetry(struct guest_info * core ) {
+/* Format the "telem.<invocation count>>" prefix used on every telemetry
+ * output line into hdr_buf (at most len bytes). Does not bump invoke_cnt;
+ * callers increment it themselves after calling this. */
+static void telemetry_header(struct v3_vm_info *vm, char *hdr_buf, int len)
+{
+ struct v3_telemetry_state * telemetry = &(vm->telemetry);
+ snprintf(hdr_buf, len, "telem.%d>", telemetry->invoke_cnt);
+}
+
+/* Print the TSC cycle count covered by this telemetry window (delta since
+ * the previous invocation) and reset the window start (prev_tsc) to now. */
+static void print_telemetry_start(struct v3_vm_info *vm, char *hdr_buf)
+{
+ struct v3_telemetry_state * telemetry = &(vm->telemetry);
+ uint64_t invoke_tsc = 0;
+ rdtscll(invoke_tsc);
+ V3_Print("%stelemetry window tsc cnt: %d\n", hdr_buf, (uint32_t)(invoke_tsc - telemetry->prev_tsc));
+ telemetry->prev_tsc = invoke_tsc;
+}
+
+/* Print the closing marker for a telemetry dump. (vm is unused; kept for
+ * symmetry with the other print_telemetry_* helpers.) */
+static void print_telemetry_end(struct v3_vm_info *vm, char *hdr_buf)
+{
+ V3_Print("%s Telemetry done\n", hdr_buf);
+}
+
+static void print_core_telemetry(struct guest_info * core, char *hdr_buf)
+{
struct exit_event * evt = NULL;
struct rb_node * node = v3_rb_first(&(core->core_telem.exit_root));
-
+
V3_Print("Exit information for Core %d\n", core->vcpu_id);
if (!node) {
evt = rb_entry(node, struct exit_event, tree_node);
const char * code_str = vmexit_code_to_str(evt->exit_code);
- V3_Print("%s:%sCnt=%u,%sAvg. Time=%u\n",
- code_str,
+ V3_Print("%s%s:%sCnt=%u,%sAvg. Time=%u\n",
+ hdr_buf, code_str,
(strlen(code_str) > 13) ? "\t" : "\t\t",
evt->cnt,
(evt->cnt >= 100) ? "\t" : "\t\t",
return;
}
-void v3_print_global_telemetry(struct v3_vm_info * vm) {
+void v3_print_core_telemetry(struct guest_info * core ) {
+ struct v3_vm_info *vm = core->vm_info;
struct v3_telemetry_state * telemetry = &(vm->telemetry);
- uint64_t invoke_tsc = 0;
char hdr_buf[32];
+
+ telemetry_header(vm, hdr_buf, 32);
+ telemetry->invoke_cnt++; // XXX this increment isn't atomic and probably should be
- rdtscll(invoke_tsc);
-
- snprintf(hdr_buf, 32, "telem.%d>", telemetry->invoke_cnt++);
+ print_telemetry_start(vm, hdr_buf);
+ print_core_telemetry(core, hdr_buf);
+ print_telemetry_end(vm, hdr_buf);
- V3_Print("%stelemetry window tsc cnt: %d\n", hdr_buf, (uint32_t)(invoke_tsc - telemetry->prev_tsc));
+ return;
+}
+static void telemetry_callbacks(struct v3_vm_info * vm, char *hdr_buf)
+{
+ struct v3_telemetry_state * telemetry = &(vm->telemetry);
// Registered callbacks
{
struct telemetry_cb * cb = NULL;
cb->telemetry_fn(vm, cb->private_data, hdr_buf);
}
}
+}
- telemetry->prev_tsc = invoke_tsc;
+void v3_print_global_telemetry(struct v3_vm_info * vm) {
+ struct v3_telemetry_state * telemetry = &(vm->telemetry);
+ char hdr_buf[32];
- V3_Print("%s Telemetry done\n", hdr_buf);
+ telemetry_header(vm, hdr_buf, 32);
+ telemetry->invoke_cnt++; // XXX this increment isn't atomic and probably should be
+
+ print_telemetry_start( vm, hdr_buf );
+ telemetry_callbacks( vm, hdr_buf );
+ print_telemetry_end( vm, hdr_buf );
+}
+
+/* Print a combined telemetry report: the window header, per-exit stats for
+ * one core, all registered VM-wide telemetry callbacks, and the closing
+ * marker. Bumps the (non-atomic) invocation counter. */
+void v3_print_telemetry(struct v3_vm_info * vm, struct guest_info * core )
+{
+ struct v3_telemetry_state * telemetry = &(vm->telemetry);
+ char hdr_buf[32];
+
+ telemetry_header(vm, hdr_buf, 32);
+ telemetry->invoke_cnt++; // XXX this increment isn't atomic and probably should be
+
+ print_telemetry_start(vm, hdr_buf);
+ print_core_telemetry(core, hdr_buf);
+ telemetry_callbacks(vm, hdr_buf);
+ print_telemetry_end(vm, hdr_buf);
+
+ return;
+}
* (1) Add support for temporarily skewing guest time off of where it should
* be to support slack simulation of guests. The idea is that simulators
* set this skew to be the difference between how much time passed for a
- * simulated feature and a real implementation of that feature, making
+ * simulated feature and a real implementation of that feature, making time
* pass at a different rate from real time on this core. The VMM will then
* attempt to move this skew back towards 0 subject to resolution/accuracy
* constraints from various system timers.
* The main effort in doing this will be to get accuracy/resolution
* information from each local timer and to use this to bound how much skew
* is removed on each exit.
+ *
+ * (2) Look more into sychronizing the offsets *across* virtual and physical
+ * cores so that multicore guests stay mostly in sync.
+ *
+ * (3) Look into using the AMD TSC multiplier feature and adding explicit time
+ * dilation support to time handling.
*/
}
}
+/* Handle TSC timeout hooks */
+/* Register a timeout hook on the given core. The hook's callback will be
+ * invoked (with priv_data) from v3_check_timeout() whenever any scheduled
+ * timeout fires. Allocation failure is fatal (V3_ASSERT). Returns the new
+ * hook; the caller releases it with v3_remove_timeout_hook(). */
+struct v3_timeout_hook *
+v3_add_timeout_hook(struct guest_info * info, v3_timeout_callback_t callback,
+ void * priv_data) {
+ struct v3_timeout_hook * timeout = NULL;
+ timeout = (struct v3_timeout_hook *)V3_Malloc(sizeof(struct v3_timeout_hook));
+ V3_ASSERT(timeout != NULL);
+
+ timeout->callback = callback;
+ timeout->private_data = priv_data;
+
+ list_add(&(timeout->hook_link), &(info->time_state.timeout_hooks));
+ return timeout;
+}
+
+/* Unlink a previously registered timeout hook and free it.
+ * Always returns 0. The hook pointer is invalid after this call. */
+int
+v3_remove_timeout_hook(struct guest_info * info, struct v3_timeout_hook * hook) {
+ list_del(&(hook->hook_link));
+ V3_Free(hook);
+ return 0;
+}
+
+/* Request a one-shot timeout at the given guest time (in cycles).
+ * Only ever moves the pending timeout earlier: only the soonest requested
+ * timeout is tracked, and all hooks are notified when it fires (a hook
+ * that wanted a later time must reschedule itself). Always returns 0. */
+int v3_schedule_timeout(struct guest_info * info, ullong_t guest_timeout) {
+ struct vm_time *time_state = &info->time_state;
+ /* Note that virtualization architectures that support it (like newer
+ * VMX systems) will turn on an active preemption timeout if
+ * available to get this timeout as closely as possible. Other systems
+ * only catch it in the periodic interrupt and so are less precise */
+ if (guest_timeout < time_state->next_timeout) {
+ time_state->next_timeout = guest_timeout;
+ }
+ return 0;
+}
+
+/* Check whether the pending guest-time timeout has expired and, if so,
+ * invoke every registered timeout hook on this core. The pending timeout
+ * is cleared (reset to the (ullong_t)-1 "none" sentinel) before the
+ * callbacks run, so a hook may call v3_schedule_timeout() to re-arm.
+ *
+ * Uses the _safe list iterator so a hook may remove (and free) itself via
+ * v3_remove_timeout_hook() from inside its callback without corrupting
+ * the traversal -- the same list_for_each_entry_safe pattern used for the
+ * other deletion-capable list walks in this patch. Always returns 0. */
+int v3_check_timeout( struct guest_info * info ) {
+ struct vm_time *time_state = &info->time_state;
+ if (time_state->next_timeout <= v3_get_guest_time(time_state)) {
+	struct v3_timeout_hook * tmp_timeout, * next_hook;
+	time_state->next_timeout = (ullong_t)-1;
+	list_for_each_entry_safe(tmp_timeout, next_hook, &(time_state->timeout_hooks), hook_link) {
+	    tmp_timeout->callback(info, tmp_timeout->private_data);
+	}
+    }
+    return 0;
+}
+
/*
* Handle full virtualization of the time stamp counter. As noted
* above, we don't store the actual value of the TSC, only the guest's
time_state->guest_host_offset = 0;
time_state->tsc_guest_offset = 0;
+ INIT_LIST_HEAD(&(time_state->timeout_hooks));
+ time_state->next_timeout = (ullong_t)-1;
+
INIT_LIST_HEAD(&(time_state->timers));
time_state->num_timers = 0;
vmx_ret |= check_vmcs_write(VMCS_CR4_MASK, CR4_VMXE | CR4_PAE);
+ // Setup Guests initial PAT field
+ vmx_ret |= check_vmcs_write(VMCS_GUEST_PAT, 0x0007040600070406LL);
+
/* Setup paging */
if (core->shdw_pg_mode == SHADOW_PAGING) {
PrintDebug("Creating initial shadow page table\n");
core->rip = 0xfff0;
core->vm_regs.rdx = 0x00000f00;
core->ctrl_regs.rflags = 0x00000002; // The reserved bit is always 1
- core->ctrl_regs.cr0 = 0x60010010; // Set the WP flag so the memory hooks work in real-mode
-
+ core->ctrl_regs.cr0 = 0x00000030;
+ core->ctrl_regs.cr4 = 0x00002010; // Enable VMX and PSE flag
+
core->segments.cs.selector = 0xf000;
core->segments.cs.limit = 0xffff;
core->segments.ldtr.selector = 0x0000;
core->segments.ldtr.limit = 0x0000ffff;
core->segments.ldtr.base = 0x0000000000000000LL;
- core->segments.ldtr.type = 2;
+ core->segments.ldtr.type = 0x2;
core->segments.ldtr.present = 1;
core->segments.tr.selector = 0x0000;
// save STAR, LSTAR, FMASK, KERNEL_GS_BASE MSRs in MSR load/store area
{
- int msr_ret = 0;
- struct vmcs_msr_entry * exit_store_msrs = NULL;
- struct vmcs_msr_entry * exit_load_msrs = NULL;
- struct vmcs_msr_entry * entry_load_msrs = NULL;;
+ struct vmcs_msr_save_area * msr_entries = NULL;
int max_msrs = (hw_info.misc_info.max_msr_cache_size + 1) * 4;
+ int msr_ret = 0;
V3_Print("Setting up MSR load/store areas (max_msr_count=%d)\n", max_msrs);
return -1;
}
- vmx_state->msr_area = V3_VAddr(V3_AllocPages(1));
-
- if (vmx_state->msr_area == NULL) {
+ vmx_state->msr_area_paddr = (addr_t)V3_AllocPages(1);
+
+ if (vmx_state->msr_area_paddr == (addr_t)NULL) {
PrintError("could not allocate msr load/store area\n");
return -1;
}
+ msr_entries = (struct vmcs_msr_save_area *)V3_VAddr((void *)(vmx_state->msr_area_paddr));
+ vmx_state->msr_area = msr_entries; // cache in vmx_info
+
+ memset(msr_entries, 0, PAGE_SIZE);
+
+ msr_entries->guest_star.index = IA32_STAR_MSR;
+ msr_entries->guest_lstar.index = IA32_LSTAR_MSR;
+ msr_entries->guest_fmask.index = IA32_FMASK_MSR;
+ msr_entries->guest_kern_gs.index = IA32_KERN_GS_BASE_MSR;
+
+ msr_entries->host_star.index = IA32_STAR_MSR;
+ msr_entries->host_lstar.index = IA32_LSTAR_MSR;
+ msr_entries->host_fmask.index = IA32_FMASK_MSR;
+ msr_entries->host_kern_gs.index = IA32_KERN_GS_BASE_MSR;
+
msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_CNT, 4);
msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_CNT, 4);
msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_CNT, 4);
-
-
- exit_store_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area);
- exit_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 4));
- entry_load_msrs = (struct vmcs_msr_entry *)(vmx_state->msr_area + (sizeof(struct vmcs_msr_entry) * 8));
+ msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
+ msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->guest_msrs));
+ msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(msr_entries->host_msrs));
- exit_store_msrs[0].index = IA32_STAR_MSR;
- exit_store_msrs[1].index = IA32_LSTAR_MSR;
- exit_store_msrs[2].index = IA32_FMASK_MSR;
- exit_store_msrs[3].index = IA32_KERN_GS_BASE_MSR;
-
- memcpy(exit_store_msrs, exit_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
- memcpy(exit_store_msrs, entry_load_msrs, sizeof(struct vmcs_msr_entry) * 4);
-
- v3_get_msr(IA32_STAR_MSR, &(exit_load_msrs[0].hi), &(exit_load_msrs[0].lo));
- v3_get_msr(IA32_LSTAR_MSR, &(exit_load_msrs[1].hi), &(exit_load_msrs[1].lo));
- v3_get_msr(IA32_FMASK_MSR, &(exit_load_msrs[2].hi), &(exit_load_msrs[2].lo));
- v3_get_msr(IA32_KERN_GS_BASE_MSR, &(exit_load_msrs[3].hi), &(exit_load_msrs[3].lo));
+ msr_ret |= v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);
- msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_STORE_ADDR, (addr_t)V3_PAddr(exit_store_msrs));
- msr_ret |= check_vmcs_write(VMCS_EXIT_MSR_LOAD_ADDR, (addr_t)V3_PAddr(exit_load_msrs));
- msr_ret |= check_vmcs_write(VMCS_ENTRY_MSR_LOAD_ADDR, (addr_t)V3_PAddr(entry_load_msrs));
+ // IMPORTANT: These MSRs appear to be cached by the hardware....
+ msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, IA32_STAR_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, IA32_LSTAR_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, IA32_FMASK_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, IA32_KERN_GS_BASE_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
+ msr_ret |= v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);
- // IMPORTANT: These SYSCALL MSRs are currently not handled by hardware or cached
- // We should really emulate these ourselves, or ideally include them in the MSR store area if there is room
- v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, SYSENTER_CS_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, SYSENTER_ESP_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, SYSENTER_EIP_MSR, NULL, NULL, NULL);
-
- v3_hook_msr(core->vm_info, FS_BASE_MSR, NULL, NULL, NULL);
- v3_hook_msr(core->vm_info, GS_BASE_MSR, NULL, NULL, NULL);
-
+ // Not sure what to do about this... Does not appear to be an explicit hardware cache version...
+ msr_ret |= v3_hook_msr(core->vm_info, IA32_CSTAR_MSR, NULL, NULL, NULL);
+
+ if (msr_ret != 0) {
+ PrintError("Error configuring MSR save/restore area\n");
+ return -1;
+ }
+
}
return -1;
}
+ /*
if (v3_update_vmcs_host_state(core)) {
PrintError("Could not write host state\n");
return -1;
}
+ */
// reenable global interrupts for vm state initialization now
// that the vm state is initialized. If another VM kicks us off,
}
+/* Program the VMX active preemption timer (when the hardware supports
+ * modifying that pin control) so the guest exits at the core's next
+ * scheduled timeout. Always returns 0, whether or not a timer was armed.
+ * NOTE(review): the req_mask test assumes a set bit means the pin control
+ * cannot be changed -- confirm against the hw_info.pin_ctrls semantics. */
+int
+v3_vmx_schedule_timeout(struct guest_info * info)
+{
+ struct vmx_data * vmx_state = (struct vmx_data *)(info->vmm_data);
+ sint64_t cycles;
+ uint32_t timeout;
+
+ /* Check if the hardware supports an active timeout */
+#define VMX_ACTIVE_PREEMPT_TIMER_PIN 0x40
+ if (hw_info.pin_ctrls.req_mask & VMX_ACTIVE_PREEMPT_TIMER_PIN) {
+ /* The hardware doesn't support us modifying this pin control */
+ return 0;
+ }
+
+ /* Check if we have one to schedule and schedule it if we do */
+ cycles = (sint64_t)info->time_state.next_timeout - (sint64_t)v3_get_guest_time(&info->time_state);
+ if (info->time_state.next_timeout == (ullong_t) -1) {
+ /* No timeout pending: disable the preemption timer entirely. */
+ timeout = 0;
+ vmx_state->pin_ctrls.active_preempt_timer = 0;
+ } else if (cycles < 0) {
+ /* set the timeout to 0 to force an immediate re-exit since it expired between
+ * when we checked a timeout and now. IF SOMEONE CONTINUALLY SETS A SHORT TIMEOUT,
+ * THIS CAN LOCK US OUT OF THE GUEST! */
+ timeout = 0;
+ vmx_state->pin_ctrls.active_preempt_timer = 1;
+ } else {
+ /* The hardware supports scheduling a timeout, and we have one to
+ * schedule */
+ timeout = (uint32_t)cycles >> hw_info.misc_info.tsc_multiple;
+ vmx_state->pin_ctrls.active_preempt_timer = 1;
+ }
+
+ /* Actually program the timer based on the settings above. */
+ check_vmcs_write(VMCS_PREEMPT_TIMER, timeout);
+ check_vmcs_write(VMCS_PIN_CTRLS, vmx_state->pin_ctrls.value);
+ return 0;
+}
+
/*
* CAUTION and DANGER!!!
*
// Perform any additional yielding needed for time adjustment
v3_adjust_time(info);
+ // Check for timeout - since this calls generic hooks in devices
+ // that may do things like pause the VM, it cannot be with interrupts
+ // disabled.
+ v3_check_timeout(info);
+
// disable global interrupts for vm state transition
v3_disable_ints();
// Update timer devices late after being in the VM so that as much
- // of hte time in the VM is accounted for as possible. Also do it before
+ // of the time in the VM is accounted for as possible. Also do it before
// updating IRQ entry state so that any interrupts the timers raise get
// handled on the next VM entry. Must be done with interrupts disabled.
v3_update_timers(info);
vmcs_write(VMCS_GUEST_CR3, guest_cr3);
}
+ // Update vmx active preemption timer to exit at the next timeout if
+ // the hardware supports it.
+ v3_vmx_schedule_timeout(info);
+
// Perform last-minute time bookkeeping prior to entering the VM
v3_time_enter_vm(info);
// This is handled in the atomic part of the vmx code,
// not in the generic (interruptable) vmx handler
break;
-
+ case VMEXIT_EXPIRED_PREEMPT_TIMER:
+ V3_Print("VMX Preempt Timer Expired.\n");
+ // This just forces an exit and is handled outside the switch
+ break;
default:
PrintError("Unhandled VMEXIT: %s (%u), %lu (0x%lx)\n",
/* Same as SVM */
static int update_map(struct v3_vm_info * vm, uint16_t port, int hook_read, int hook_write) {
- uchar_t * bitmap = (uint8_t *)(vm->io_map.arch_data);
+ uint8_t * bitmap = (uint8_t *)(vm->io_map.arch_data);
int major = port / 8;
int minor = port % 8;
PrintDebug("IN of %d bytes on port %d (0x%x)\n", read_size, io_qual.port, io_qual.port);
if (hook == NULL) {
- PrintDebug("IN operation on unhooked IO port 0x%x\n", io_qual.port);
+ PrintDebug("IN operation on unhooked IO port 0x%x - returning zeros\n", io_qual.port);
+ core->vm_regs.rax >>= 8*read_size;
+ core->vm_regs.rax <<= 8*read_size;
- /* What are the HW semantics for an IN on an invalid port?
- * Do we need to clear the register value or leave it untouched???
- */
} else {
if (hook->read(core, io_qual.port, &(core->vm_regs.rax), read_size, hook->priv_data) != read_size) {
PrintError("Read failure for IN on port %x\n", io_qual.port);
addr_t guest_va = exit_info->guest_linear_addr;
addr_t host_addr = 0;
int rdi_change = 0;
- ulong_t rep_num = 1;
+ uint32_t rep_num = 1;
struct rflags * flags = (struct rflags *)&(core->ctrl_regs.rflags);
hook = v3_get_io_hook(core->vm_info, io_qual.port);
do {
if (hook == NULL) {
- PrintDebug("INS operation on unhooked IO port 0x%x\n", io_qual.port);
+ PrintDebug("INS operation on unhooked IO port 0x%x - returning zeros\n", io_qual.port);
- /* What are the HW semantics for an INS on an invalid port?
- * Do we need to clear the memory region or leave it untouched???
- */
+ memset((char*)host_addr,0,read_size);
+
} else {
if (hook->read(core, io_qual.port, (char *)host_addr, read_size, hook->priv_data) != read_size) {
PrintError("Read Failure for INS on port 0x%x\n", io_qual.port);
PrintDebug("OUT of %d bytes on port %d (0x%x)\n", write_size, io_qual.port, io_qual.port);
if (hook == NULL) {
- PrintDebug("OUT operation on unhooked IO port 0x%x\n", io_qual.port);
+ PrintDebug("OUT operation on unhooked IO port 0x%x - ignored\n", io_qual.port);
} else {
if (hook->write(core, io_qual.port, &(core->vm_regs.rax), write_size, hook->priv_data) != write_size) {
PrintError("Write failure for out on port %x\n",io_qual.port);
addr_t guest_va = exit_info->guest_linear_addr;
addr_t host_addr;
int rsi_change;
- ulong_t rep_num = 1;
+ uint32_t rep_num = 1;
struct rflags * flags = (struct rflags *)&(core->ctrl_regs.rflags);
hook = v3_get_io_hook(core->vm_info, io_qual.port);
do {
if (hook == NULL) {
- PrintDebug("OUTS operation on unhooked IO port 0x%x\n", io_qual.port);
+ PrintDebug("OUTS operation on unhooked IO port 0x%x - ignored\n", io_qual.port);
} else {
if (hook->write(core, io_qual.port, (char *)host_addr, write_size, hook->priv_data) != write_size) {
PrintError("Read failure for INS on port 0x%x\n", io_qual.port);
uint8_t write_val = (hook_writes) ? 0x1 : 0x0;
uint8_t * bitmap = (uint8_t *)(vm->msr_map.arch_data);
+ if (index == -1) {
+ // PrintError("Error updating MSR Map failed bitmap index for (0x%x)\n", msr);
+ // MSRs not in the bitmap covered range will always trigger exits, so we don't need to worry about them here.
+ return 0;
+ }
+
*(bitmap + major) &= ~(mask << minor);
*(bitmap + major) |= (read_val << minor);
struct vnet_dev * dev = NULL;
list_for_each_entry(dev, &(vnet_state.devs), node) {
- int dev_id = dev->dev_id;
-
- if (dev_id == idx)
+ if (dev->dev_id == idx) {
return dev;
+ }
}
return NULL;
/* delete all route entries with specfied src or dst device id */
static void inline del_routes_by_dev(int dev_id){
- struct vnet_route_info * route = NULL;
+ struct vnet_route_info * route, *tmp_route;
unsigned long flags;
flags = vnet_lock_irqsave(vnet_state.lock);
- list_for_each_entry(route, &(vnet_state.routes), node) {
+ list_for_each_entry_safe(route, tmp_route, &(vnet_state.routes), node) {
if((route->route_def.dst_type == LINK_INTERFACE &&
route->route_def.dst_id == dev_id) ||
(route->route_def.src_type == LINK_INTERFACE &&
}
static void deinit_devices_list(){
- struct vnet_dev * dev = NULL;
+ struct vnet_dev * dev, * tmp;
- list_for_each_entry(dev, &(vnet_state.devs), node) {
+ list_for_each_entry_safe(dev, tmp, &(vnet_state.devs), node) {
list_del(&(dev->node));
Vnet_Free(dev);
}
}
static void deinit_routes_list(){
- struct vnet_route_info * route = NULL;
+ struct vnet_route_info * route, * tmp;
- list_for_each_entry(route, &(vnet_state.routes), node) {
+ list_for_each_entry_safe(route, tmp, &(vnet_state.routes), node) {
list_del(&(route->node));
list_del(&(route->match_node));
Vnet_Free(route);