2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Alexander Kudryavtsev <alexk@ispras.ru>
14 * Implementation of FW_CFG interface
15 * Author: Jack Lange <jacklange@cs.pitt.edu>
18 * This is free software. You are permitted to use,
19 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
22 #include <palacios/vmm_fw_cfg.h>
23 #include <palacios/vmm_mem.h>
24 #include <palacios/vmm.h>
25 #include <palacios/vm_guest.h>
29 This subsystem of Palacios interacts with the SEABIOS in order to
30 create highly customized configurations for the guest. Currently,
31 the primary purpose of such configuration is to pass a NUMA configuration
32 to the guest via ACPI. Currently, we are able to create NUMA domains,
33 map regions of guest physical addresses to them, and map vcores to them.
34 Additionally, these virtual NUMA domains are then mapped to physical
35 (host) NUMA domains. Other elements of Palacios handle vcore to
36 physical core mapping, as well as guest memory allocation such that
37 the needed physical NUMA domain mapping is correct.
39 The following describes how the XML configuration of a virtual NUMA guest
42 <mem_layout vnodes=n> (How many numa domains the guest will see)
43 (guest physical addresses x to y-1 are numa domain i and
44 numa domain i is mapped to host numa domain j)
45 <region vnode=i start_addr=x end_addr=y node=j>
For example, a 4 virtual domain guest mapped to a 2 domain host:
51 <mem_layout vnodes="4">
52 <region vnode="0" start_addr="0x00000000" end_addr="0x10000000" node="0" />
53 <region vnode="1" start_addr="0x10000000" end_addr="0x20000000" node="1" />
54 <region vnode="2" start_addr="0x20000000" end_addr="0x30000000" node="0" />
55 <region vnode="3" start_addr="0x30000000" end_addr="0x40000000" node="1" />
58 You also need to map the virtual cores to the domains, which is
59 done with the <cores> tag. This usually also indicates which physical core
60 the virtual core maps to, so that the NUMA topology the guest sees has
61 performance characteristics that make sense.
63 <cores count=m> (How many virtual cores we have)
64 <core vnode=i target_cpu=q> (vcore 0 maps to virtual numa zone i and pcore q)
65 <core vnode=j target_cpu=r> (vcore 1 maps to virtual numa zone j and pcore r)
For example, here are 8 virtual cores mapped across our numa domains, pairwise
72 <core target_cpu="1" vnode="0"/>
73 <core target_cpu="2" vnode="0"/>
74 <core target_cpu="3" vnode="1"/>
75 <core target_cpu="4" vnode="1"/>
76 <core target_cpu="5" vnode="2"/>
77 <core target_cpu="6" vnode="2"/>
78 <core target_cpu="7" vnode="3"/>
79 <core target_cpu="8" vnode="3"/>
86 #define FW_CFG_CTL_PORT 0x510
87 #define FW_CFG_DATA_PORT 0x511
89 #define FW_CFG_SIGNATURE 0x00
90 #define FW_CFG_ID 0x01
91 #define FW_CFG_UUID 0x02
92 #define FW_CFG_RAM_SIZE 0x03
93 #define FW_CFG_NOGRAPHIC 0x04
94 #define FW_CFG_NB_CPUS 0x05
95 #define FW_CFG_MACHINE_ID 0x06
96 #define FW_CFG_KERNEL_ADDR 0x07
97 #define FW_CFG_KERNEL_SIZE 0x08
98 #define FW_CFG_KERNEL_CMDLINE 0x09
99 #define FW_CFG_INITRD_ADDR 0x0a
100 #define FW_CFG_INITRD_SIZE 0x0b
101 #define FW_CFG_BOOT_DEVICE 0x0c
102 #define FW_CFG_NUMA 0x0d
103 #define FW_CFG_BOOT_MENU 0x0e
104 #define FW_CFG_MAX_CPUS 0x0f
105 #define FW_CFG_KERNEL_ENTRY 0x10
106 #define FW_CFG_KERNEL_DATA 0x11
107 #define FW_CFG_INITRD_DATA 0x12
108 #define FW_CFG_CMDLINE_ADDR 0x13
109 #define FW_CFG_CMDLINE_SIZE 0x14
110 #define FW_CFG_CMDLINE_DATA 0x15
111 #define FW_CFG_SETUP_ADDR 0x16
112 #define FW_CFG_SETUP_SIZE 0x17
113 #define FW_CFG_SETUP_DATA 0x18
114 #define FW_CFG_FILE_DIR 0x19
116 #define FW_CFG_WRITE_CHANNEL 0x4000
117 #define FW_CFG_ARCH_LOCAL 0x8000
118 #define FW_CFG_ENTRY_MASK ~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)
120 #define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
121 #define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
122 #define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
123 #define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3)
124 #define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)
126 #define FW_CFG_INVALID 0xffff
135 E820_TYPE_ACPI_RECL = 3,
136 E820_TYPE_ACPI_NVS = 4,
140 #define E820_MAX_COUNT 128
141 struct e820_entry_packed {
145 } __attribute__((packed));
149 struct e820_entry_packed entry[E820_MAX_COUNT];
150 } __attribute__((packed)) __attribute((__aligned__(4)));
155 // Internal version assumes data is allocated
157 static int fw_cfg_add_bytes_internal(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint8_t * data, uint32_t len)
159 int arch = !!(key & FW_CFG_ARCH_LOCAL);
160 // JRL: Well this is demented... Its basically generating a 1 or 0 from a mask operation
162 key &= FW_CFG_ENTRY_MASK;
164 if (key >= FW_CFG_MAX_ENTRY) {
168 cfg_state->entries[arch][key].data = data;
169 cfg_state->entries[arch][key].len = len;
175 // General purpose version will allocate a temp
178 static int fw_cfg_add_bytes(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint8_t * data, uint32_t len)
180 // must make a copy of the data so that the deinit function will work correctly...
182 uint16_t * copy = NULL;
184 copy = V3_Malloc(len);
186 PrintError(VM_NONE,VCORE_NONE,"Failed to allocate temp\n");
189 memcpy(copy,data,len);
190 return fw_cfg_add_bytes_internal(cfg_state, key, (uint8_t *)copy, sizeof(uint16_t));
193 static int fw_cfg_add_i16(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint16_t value)
195 uint16_t * copy = NULL;
197 copy = V3_Malloc(sizeof(uint16_t));
199 PrintError(VM_NONE,VCORE_NONE,"Failed to allocate temp\n");
203 return fw_cfg_add_bytes_internal(cfg_state, key, (uint8_t *)copy, sizeof(uint16_t));
206 static int fw_cfg_add_i32(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint32_t value)
208 uint32_t * copy = NULL;
210 copy = V3_Malloc(sizeof(uint32_t));
212 PrintError(VM_NONE,VCORE_NONE,"Failed to allocate temp\n");
216 return fw_cfg_add_bytes_internal(cfg_state, key, (uint8_t *)copy, sizeof(uint32_t));
219 static int fw_cfg_add_i64(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint64_t value)
221 uint64_t * copy = NULL;
223 copy = V3_Malloc(sizeof(uint64_t));
225 PrintError(VM_NONE,VCORE_NONE,"Failed to allocate temp\n");
229 return fw_cfg_add_bytes_internal(cfg_state, key, (uint8_t *)copy, sizeof(uint64_t));
232 static int fw_cfg_ctl_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
236 static int fw_cfg_ctl_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
237 V3_ASSERT(core->vm_info, core, length == 2);
239 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
240 uint16_t key = *(uint16_t *)src;
243 cfg_state->cur_offset = 0;
245 if ((key & FW_CFG_ENTRY_MASK) >= FW_CFG_MAX_ENTRY) {
246 cfg_state->cur_entry = FW_CFG_INVALID;
249 cfg_state->cur_entry = key;
257 static int fw_cfg_data_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
258 V3_ASSERT(core->vm_info, core, length == 1);
260 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
261 int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
262 struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];
265 if ( (cfg_state->cur_entry == FW_CFG_INVALID) ||
266 (cfg_entry->data == NULL) ||
267 (cfg_state->cur_offset >= cfg_entry->len)) {
271 ret = cfg_entry->data[cfg_state->cur_offset++];
274 *(uint8_t *)src = ret;
279 static int fw_cfg_data_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
280 V3_ASSERT(core->vm_info, core, length == 1);
282 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
283 int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
284 struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];
286 if ( (cfg_state->cur_entry & FW_CFG_WRITE_CHANNEL) &&
287 (cfg_entry->callback != NULL) &&
288 (cfg_state->cur_offset < cfg_entry->len)) {
290 cfg_entry->data[cfg_state->cur_offset++] = *(uint8_t *)src;
292 if (cfg_state->cur_offset == cfg_entry->len) {
293 cfg_entry->callback(cfg_entry->callback_opaque, cfg_entry->data);
294 cfg_state->cur_offset = 0;
301 static struct e820_table * e820_populate(struct v3_vm_info * vm) {
302 struct v3_e820_entry * entry = NULL;
303 struct e820_table * e820 = NULL;
306 if (vm->mem_map.e820_count > E820_MAX_COUNT) {
307 PrintError(vm, VCORE_NONE,"Too much E820 table entries! (max is %d)\n", E820_MAX_COUNT);
311 e820 = V3_Malloc(sizeof(struct e820_table));
314 PrintError(vm, VCORE_NONE, "Out of memory!\n");
318 e820->count = vm->mem_map.e820_count;
320 list_for_each_entry(entry, &vm->mem_map.e820_list, list) {
321 e820->entry[i].addr = e->addr;
322 e820->entry[i].size = e->size;
323 e820->entry[i].type = e->type;
331 void v3_fw_cfg_deinit(struct v3_vm_info *vm) {
332 struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);
335 for (i = 0; i < 2; ++i) {
336 for (j = 0; j < FW_CFG_MAX_ENTRY; ++j) {
337 if (cfg_state->entries[i][j].data != NULL)
338 V3_Free(cfg_state->entries[i][j].data);
342 v3_unhook_io_port(vm, FW_CFG_CTL_PORT);
343 v3_unhook_io_port(vm, FW_CFG_DATA_PORT);
/*
 * v3_fw_cfg_init -- configure the QEMU-style fw_cfg interface for this VM.
 * Hooks the control/data I/O ports, registers the standard firmware entries
 * (signature, CPU counts, RAM size, E820, SMBIOS, HPET), and builds the
 * virtual NUMA configuration array from the <mem_layout> XML subtree.
 *
 * NOTE(review): this chunk of the file is missing several lines -- the
 * declarations of ret, i, num_nodes, node_offset, core_offset, smbios_table,
 * smbios_len, hpet_cfg, end_addr, and vnode_id, plus a number of guarding
 * if/else braces and #ifdef/#endif lines, are not visible here. The comments
 * below flag each such gap; confirm against the full source before editing.
 */
int v3_fw_cfg_init(struct v3_vm_info * vm) {

    struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);

    // Defaults: the VM's full memory size and core count
    uint64_t mem_size = vm->mem_size;
    uint32_t num_cores = vm->num_cores;

    // NOTE(review): these overrides restrict the firmware view to the HVM ROS
    // partition; presumably guarded by an HVM conditional not visible here.
    mem_size = v3_get_hvm_ros_memsize(vm);
    num_cores = v3_get_hvm_ros_cores(vm);

    // Be paranoid about starting this as all "unallocated"
    memset(cfg_state,0,sizeof(struct v3_fw_cfg_state));

#ifndef V3_CONFIG_SEABIOS
    V3_Print(vm,VCORE_NONE,"Warning: Configuring SEABIOS firmware, but SEABIOS is not being used in this build of Palacios. Configuration will be dormant.\n");

    struct e820_table * e820 = e820_populate(vm);

    // NOTE(review): the NULL check guarding this error path is not visible here
        PrintError(vm, VCORE_NONE, "Failed to populate E820 for FW interface!\n");

    // Hook the fw_cfg select port (0x510) and data port (0x511)
    ret |= v3_hook_io_port(vm, FW_CFG_CTL_PORT, fw_cfg_ctl_read, &fw_cfg_ctl_write, cfg_state);
    ret |= v3_hook_io_port(vm, FW_CFG_DATA_PORT, fw_cfg_data_read, &fw_cfg_data_write, cfg_state);

    // NOTE(review): the `if (ret != 0)` guard for this error path is not visible here
        PrintError(vm, VCORE_NONE, "Failed to hook FW CFG ports!\n");
        v3_fw_cfg_deinit(vm);

    // Standard entries the BIOS probes for
    fw_cfg_add_bytes(cfg_state, FW_CFG_SIGNATURE, (uint8_t *)"QEMU", 4);
    //fw_cfg_add_bytes(cfg_state, FW_CFG_UUID, qemu_uuid, 16);
    fw_cfg_add_i16(cfg_state, FW_CFG_NOGRAPHIC, /*(uint16_t)(display_type == DT_NOGRAPHIC)*/ 0);
    fw_cfg_add_i16(cfg_state, FW_CFG_NB_CPUS, (uint16_t)num_cores);
    fw_cfg_add_i16(cfg_state, FW_CFG_MAX_CPUS, (uint16_t)num_cores);
    fw_cfg_add_i16(cfg_state, FW_CFG_BOOT_MENU, (uint16_t)1);
    //fw_cfg_bootsplash(cfg_state);

    fw_cfg_add_i32(cfg_state, FW_CFG_ID, 1);
    // RAM size is reported in MB
    fw_cfg_add_i64(cfg_state, FW_CFG_RAM_SIZE, mem_size / (1024 * 1024));

    //fw_cfg_add_bytes(cfg_state, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,

    fw_cfg_add_i32(cfg_state, FW_CFG_IRQ0_OVERRIDE, 1);

    smbios_table = smbios_get_table(&smbios_len);

        fw_cfg_add_bytes(cfg_state, FW_CFG_SMBIOS_ENTRIES,
                         smbios_table, smbios_len);

    fw_cfg_add_bytes(cfg_state, FW_CFG_E820_TABLE, (uint8_t *)e820,
                     sizeof(struct e820_table));

    fw_cfg_add_bytes(cfg_state, FW_CFG_HPET, (uint8_t *)&hpet_cfg,
                     sizeof(struct hpet_fw_config));

    // Virtual NUMA configuration, read from the <mem_layout> XML subtree
    // (see the large comment at the top of this file for the format)
    v3_cfg_tree_t * layout_cfg = v3_cfg_subtree(vm->cfg_data->cfg, "mem_layout");
    char * num_nodes_str = v3_cfg_val(layout_cfg, "vnodes");

    /* locations in fw_cfg NUMA array for each info region. */
    // Layout of numa_fw_cfg: [0]=node count, [1..num_cores]=core->node map,
    // [1+num_cores ..]=per-node memory sizes
    int mem_offset = 1 + num_cores;

    num_nodes = atoi(num_nodes_str);

    uint64_t * numa_fw_cfg = NULL;

    // Allocate the global NUMA configuration array
    numa_fw_cfg = V3_Malloc((1 + num_cores + num_nodes) * sizeof(uint64_t));

    if (numa_fw_cfg == NULL) {
        PrintError(vm, VCORE_NONE, "Could not allocate fw_cfg NUMA config space\n");
        v3_fw_cfg_deinit(vm);

    memset(numa_fw_cfg, 0, (1 + num_cores + num_nodes) * sizeof(uint64_t));

    // First 8 bytes is the number of NUMA zones
    numa_fw_cfg[node_offset] = num_nodes;

    // Next region is array of core->node mappings
    for (i = 0; i < num_cores; i++) {
        char * vnode_str = v3_cfg_val(vm->cores[i].core_cfg_data, "vnode");

        if (vnode_str == NULL) {
            // if no cpu was specified then NUMA layout is randomized, and we're screwed...
            numa_fw_cfg[core_offset + i] = 0;
            // NOTE(review): the `} else {` between these assignments is not visible here
            numa_fw_cfg[core_offset + i] = (uint64_t)atoi(vnode_str);

    /* Final region is an array of node->mem_size mappings
     * this assumes that memory is assigned to NUMA nodes in consecutive AND contiguous blocks
     * NO INTERLEAVING ALLOWED
     * e.g. node 0 points to the first x bytes of memory, node 1 points to the next y bytes, etc
     * The array only stores the x,y,... values, indexed by the node ID
     * We should probably fix this, but that will require modifications to SEABIOS
     *
     * For now we will assume that the xml data is set accordingly, so we will just walk through the mem regions specified there.
     * NOTE: This will overwrite configurations if multiple xml regions are defined for each node
     */
    v3_cfg_tree_t * region_desc = v3_cfg_subtree(layout_cfg, "region");

    while (region_desc) {
        char * start_addr_str = v3_cfg_val(region_desc, "start_addr");
        char * end_addr_str = v3_cfg_val(region_desc, "end_addr");
        char * vnode_id_str = v3_cfg_val(region_desc, "vnode");

        addr_t start_addr = 0;

        if ((!start_addr_str) || (!end_addr_str) || (!vnode_id_str)) {
            PrintError(vm, VCORE_NONE, "Invalid memory layout in configuration\n");
            v3_fw_cfg_deinit(vm);

        start_addr = atox(start_addr_str);
        end_addr = atox(end_addr_str);
        vnode_id = atoi(vnode_id_str);

        // Store the region's size under its node ID (overwrites on duplicates)
        numa_fw_cfg[mem_offset + vnode_id] = end_addr - start_addr;

        region_desc = v3_cfg_next_branch(region_desc);

    /* Print the NUMA mapping being passed in */
    uint64_t region_start = 0;

    V3_Print(vm, VCORE_NONE, "NUMA CONFIG: (nodes=%llu)\n", numa_fw_cfg[0]);

    for (i = 0; i < num_cores; i++) {
        V3_Print(vm, VCORE_NONE, "\tCore %d -> Node %llu\n", i, numa_fw_cfg[core_offset + i]);

    for (i = 0; i < num_nodes; i++) {
        V3_Print(vm, VCORE_NONE, "\tMem (%p - %p) -> Node %d\n", (void *)region_start,
                 (void *)numa_fw_cfg[mem_offset + i], i);

        region_start += numa_fw_cfg[mem_offset + i];

    // Register the NUMA cfg array with the FW_CFG interface
    // Uses the _internal variant: numa_fw_cfg's ownership transfers to
    // cfg_state directly (no extra copy is made)
    fw_cfg_add_bytes_internal(cfg_state, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
                              (1 + num_cores + num_nodes) * sizeof(uint64_t));
545 /* E820 code for HVM enabled bochs bios: */
547 /* E820 location in HVM virtual address space. Taken from VMXASSIST. */
548 #define HVM_E820_PAGE 0x00090000
549 #define HVM_E820_NR_OFFSET 0x000001E8
550 #define HVM_E820_OFFSET 0x000002D0
551 // Copy E820 to BIOS. See rombios.c, copy_e820_table function.
552 addr_t e820_ptr = (addr_t)V3_VAddr((void *)(vm->mem_map.base_region.host_addr + HVM_E820_PAGE));
554 *(uint16_t *)(e820_ptr + HVM_E820_NR_OFFSET) = e820->count;
555 memcpy((void *)(e820_ptr + HVM_E820_OFFSET), &e820->entry[0], sizeof(e820->entry[0]) * e820->count);