2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Alexander Kudryavtsev <alexk@ispras.ru>
14 * Implementation of FW_CFG interface
15 * Author: Jack Lange <jacklange@cs.pitt.edu>
18 * This is free software. You are permitted to use,
19 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
22 #include <palacios/vmm_fw_cfg.h>
23 #include <palacios/vmm_mem.h>
24 #include <palacios/vmm.h>
25 #include <palacios/vm_guest.h>
29 This subsystem of Palacios interacts with the SEABIOS in order to
30 create highly customized configurations for the guest. Currently,
31 the primary purpose of such configuration is to pass a NUMA configuration
32 to the guest via ACPI. Currently, we are able to create NUMA domains,
33 map regions of guest physical addresses to them, and map vcores to them.
34 Additionally, these virtual NUMA domains are then mapped to physical
35 (host) NUMA domains. Other elements of Palacios handle vcore to
36 physical core mapping, as well as guest memory allocation such that
37 the needed physical NUMA domain mapping is correct.
39 The following describes how the XML configuration of a virtual NUMA guest
42 <mem_layout vnodes=n> (How many numa domains the guest will see)
43 (guest physical addresses x to y-1 are numa domain i and
44 numa domain i is mapped to host numa domain j)
45 <region vnode=i start_addr=x end_addr=y node=j>
 For example, a 4 virtual domain guest mapped to a 2 domain host:
51 <mem_layout vnodes="4">
52 <region vnode="0" start_addr="0x00000000" end_addr="0x10000000" node="0" />
53 <region vnode="1" start_addr="0x10000000" end_addr="0x20000000" node="1" />
54 <region vnode="2" start_addr="0x20000000" end_addr="0x30000000" node="0" />
55 <region vnode="3" start_addr="0x30000000" end_addr="0x40000000" node="1" />
58 You also need to map the virtual cores to the domains, which is
59 done with the <cores> tag. This usually also indicates which physical core
60 the virtual core maps to, so that the NUMA topology the guest sees has
61 performance characteristics that make sense.
63 <cores count=m> (How many virtual cores we have)
64 <core vnode=i target_cpu=q> (vcore 0 maps to virtual numa zone i and pcore q)
65 <core vnode=j target_cpu=r> (vcore 1 maps to virtual numa zone j and pcore r)
 For example, here are 8 virtual cores mapped across our numa domains, pairwise
72 <core target_cpu="1" vnode="0"/>
73 <core target_cpu="2" vnode="0"/>
74 <core target_cpu="3" vnode="1"/>
75 <core target_cpu="4" vnode="1"/>
76 <core target_cpu="5" vnode="2"/>
77 <core target_cpu="6" vnode="2"/>
78 <core target_cpu="7" vnode="3"/>
79 <core target_cpu="8" vnode="3"/>
/* I/O ports of the QEMU-style fw_cfg interface: the guest writes a 16-bit
 * selector key to the control port, then transfers the selected entry's
 * payload one byte at a time through the data port. */
#define FW_CFG_CTL_PORT 0x510
#define FW_CFG_DATA_PORT 0x511

/* Well-known architecture-independent selector keys (mirrors QEMU's set). */
#define FW_CFG_SIGNATURE 0x00
#define FW_CFG_ID 0x01
#define FW_CFG_UUID 0x02
#define FW_CFG_RAM_SIZE 0x03
#define FW_CFG_NOGRAPHIC 0x04
#define FW_CFG_NB_CPUS 0x05
#define FW_CFG_MACHINE_ID 0x06
#define FW_CFG_KERNEL_ADDR 0x07
#define FW_CFG_KERNEL_SIZE 0x08
#define FW_CFG_KERNEL_CMDLINE 0x09
#define FW_CFG_INITRD_ADDR 0x0a
#define FW_CFG_INITRD_SIZE 0x0b
#define FW_CFG_BOOT_DEVICE 0x0c
#define FW_CFG_NUMA 0x0d
#define FW_CFG_BOOT_MENU 0x0e
#define FW_CFG_MAX_CPUS 0x0f
#define FW_CFG_KERNEL_ENTRY 0x10
#define FW_CFG_KERNEL_DATA 0x11
#define FW_CFG_INITRD_DATA 0x12
#define FW_CFG_CMDLINE_ADDR 0x13
#define FW_CFG_CMDLINE_SIZE 0x14
#define FW_CFG_CMDLINE_DATA 0x15
#define FW_CFG_SETUP_ADDR 0x16
#define FW_CFG_SETUP_SIZE 0x17
#define FW_CFG_SETUP_DATA 0x18
#define FW_CFG_FILE_DIR 0x19

/* Flag bits carried in the selector key: writable channel and the
 * architecture-local namespace. The mask strips both to get a table index. */
#define FW_CFG_WRITE_CHANNEL 0x4000
#define FW_CFG_ARCH_LOCAL 0x8000
/* NOTE(review): expansion is unparenthesized (~(...)); safe in current uses
 * (& and comparison contexts) but wrap in parens if ever combined further. */
#define FW_CFG_ENTRY_MASK ~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)

/* x86-local selector keys, offset into the arch-local namespace. */
#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3)
#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)

/* Sentinel for "no entry currently selected". */
#define FW_CFG_INVALID 0xffff
    E820_TYPE_ACPI_RECL = 3,   /* ACPI tables, reclaimable by the OS after parsing */
    E820_TYPE_ACPI_NVS = 4,    /* ACPI non-volatile storage, must be preserved */

/* Fixed capacity of the E820 table handed to the BIOS. */
#define E820_MAX_COUNT 128

/* One E820 map entry, packed to match the in-memory layout the BIOS expects.
 * (Field declarations are elided in this view — addr/size/type per the
 * assignments in e820_populate; TODO confirm widths against full source.) */
struct e820_entry_packed {
} __attribute__((packed));

/* Whole-table container registered under FW_CFG_E820_TABLE.
 * (Leading members, e.g. the count field, are elided in this view.) */
    struct e820_entry_packed entry[E820_MAX_COUNT];
} __attribute__((packed)) __attribute((__aligned__(4)));
/*
 * Register (or replace) the backing buffer for one fw_cfg entry.
 * @key may carry FW_CFG_ARCH_LOCAL / FW_CFG_WRITE_CHANNEL flag bits; the
 * flags select the per-arch table and are masked off to index it.
 * Ownership: the buffer pointer is stored as-is (no copy); v3_fw_cfg_deinit
 * later frees entry data, so callers should pass heap memory.
 * (Failure branch and return statements are elided in this view.)
 */
static int fw_cfg_add_bytes(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint8_t * data, uint32_t len)
{
    /* !! collapses the masked flag bit to exactly 0 or 1, selecting the
     * global vs. arch-local entry table. */
    int arch = !!(key & FW_CFG_ARCH_LOCAL);

    key &= FW_CFG_ENTRY_MASK;   /* strip flag bits to get the table index */

    if (key >= FW_CFG_MAX_ENTRY) {
        /* out-of-range key: failure path elided in this view */

    cfg_state->entries[arch][key].data = data;
    cfg_state->entries[arch][key].len = len;
171 static int fw_cfg_add_i16(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint16_t value)
173 uint16_t * copy = NULL;
175 copy = V3_Malloc(sizeof(uint16_t));
177 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint16_t));
180 static int fw_cfg_add_i32(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint32_t value)
182 uint32_t * copy = NULL;
184 copy = V3_Malloc(sizeof(uint32_t));
186 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint32_t));
189 static int fw_cfg_add_i64(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint64_t value)
191 uint64_t * copy = NULL;
193 copy = V3_Malloc(sizeof(uint64_t));
195 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint64_t));
/* Read handler for the fw_cfg control/selector port (0x510).
 * Body elided in this view — presumably a trivial handler, since the
 * selector port carries no readable state in this model; TODO confirm. */
static int fw_cfg_ctl_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
/*
 * Write handler for the fw_cfg control/selector port (0x510).
 * The guest writes a 16-bit key choosing which entry subsequent data-port
 * accesses will address. Out-of-range keys select FW_CFG_INVALID.
 */
static int fw_cfg_ctl_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
    V3_ASSERT(core->vm_info, core, length == 2);   /* selector writes are always 16-bit */

    struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
    uint16_t key = *(uint16_t *)src;

    /* selecting an entry always rewinds the byte cursor */
    cfg_state->cur_offset = 0;

    if ((key & FW_CFG_ENTRY_MASK) >= FW_CFG_MAX_ENTRY) {
        cfg_state->cur_entry = FW_CFG_INVALID;
        /* else branch (valid key) — intervening lines elided in this view */
        cfg_state->cur_entry = key;
/*
 * Read handler for the fw_cfg data port (0x511).
 * Returns the next byte of the currently selected entry and advances the
 * cursor; an invalid selection, a NULL entry buffer, or a cursor past the
 * end yields a fallback byte (that assignment is elided in this view).
 */
static int fw_cfg_data_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
    V3_ASSERT(core->vm_info, core, length == 1);   /* data port is byte-wide */

    struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
    /* split the selector into namespace (arch-local vs. global) and index */
    int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
    struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];

    if ( (cfg_state->cur_entry == FW_CFG_INVALID) ||
         (cfg_entry->data == NULL) ||
         (cfg_state->cur_offset >= cfg_entry->len)) {
        /* fallback-value branch elided in this view */

    /* normal path: hand out the next byte and advance */
    ret = cfg_entry->data[cfg_state->cur_offset++];

    *(uint8_t *)src = ret;
/*
 * Write handler for the fw_cfg data port (0x511).
 * Writes are honored only when the selected key has the FW_CFG_WRITE_CHANNEL
 * flag, a callback is registered, and the cursor is inside the buffer. When
 * the buffer fills, the callback fires and the cursor rewinds to zero.
 */
static int fw_cfg_data_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
    V3_ASSERT(core->vm_info, core, length == 1);   /* data port is byte-wide */

    struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
    int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
    struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];

    if ( (cfg_state->cur_entry & FW_CFG_WRITE_CHANNEL) &&
         (cfg_entry->callback != NULL) &&
         (cfg_state->cur_offset < cfg_entry->len)) {

        cfg_entry->data[cfg_state->cur_offset++] = *(uint8_t *)src;

        /* buffer complete: notify the registered consumer, then rewind */
        if (cfg_state->cur_offset == cfg_entry->len) {
            cfg_entry->callback(cfg_entry->callback_opaque, cfg_entry->data);
            cfg_state->cur_offset = 0;
267 static struct e820_table * e820_populate(struct v3_vm_info * vm) {
268 struct v3_e820_entry * entry = NULL;
269 struct e820_table * e820 = NULL;
272 if (vm->mem_map.e820_count > E820_MAX_COUNT) {
273 PrintError(vm, VCORE_NONE,"Too much E820 table entries! (max is %d)\n", E820_MAX_COUNT);
277 e820 = V3_Malloc(sizeof(struct e820_table));
280 PrintError(vm, VCORE_NONE, "Out of memory!\n");
284 e820->count = vm->mem_map.e820_count;
286 list_for_each_entry(entry, &vm->mem_map.e820_list, list) {
287 e820->entry[i].addr = e->addr;
288 e820->entry[i].size = e->size;
289 e820->entry[i].type = e->type;
297 void v3_fw_cfg_deinit(struct v3_vm_info *vm) {
298 struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);
301 for (i = 0; i < 2; ++i) {
302 for (j = 0; j < FW_CFG_MAX_ENTRY; ++j) {
303 if (cfg_state->entries[i][j].data != NULL)
304 V3_Free(cfg_state->entries[i][j].data);
308 v3_unhook_io_port(vm, FW_CFG_CTL_PORT);
309 v3_unhook_io_port(vm, FW_CFG_DATA_PORT);
/*
 * Populate the fw_cfg directory for a new VM: hook the two I/O ports, then
 * register the standard QEMU-style entries (signature, CPU counts, RAM size),
 * the E820 map, SMBIOS/HPET payloads, and — when a <mem_layout> subtree is
 * present in the XML config — a NUMA topology array consumed by SEABIOS.
 * Error paths call v3_fw_cfg_deinit; exact return values are elided in this
 * view (presumably 0 success / -1 failure — TODO confirm).
 */
int v3_fw_cfg_init(struct v3_vm_info * vm) {

    struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);

#ifndef V3_CONFIG_SEABIOS
    V3_Print(vm,VCORE_NONE,"Warning: Configuring SEABIOS firmware, but SEABIOS is not being used in this build of Palacios. Configuration will be dormant.\n");

    /* Snapshot the VM's memory map into a BIOS-consumable E820 table. */
    struct e820_table * e820 = e820_populate(vm);

        PrintError(vm, VCORE_NONE, "Failed to populate E820 for FW interface!\n");

    /* Hook selector + data ports; both handlers share cfg_state. */
    ret |= v3_hook_io_port(vm, FW_CFG_CTL_PORT, fw_cfg_ctl_read, &fw_cfg_ctl_write, cfg_state);
    ret |= v3_hook_io_port(vm, FW_CFG_DATA_PORT, fw_cfg_data_read, &fw_cfg_data_write, cfg_state);

        PrintError(vm, VCORE_NONE, "Failed to hook FW CFG ports!\n");
        v3_fw_cfg_deinit(vm);

    /* Standard entries expected by a QEMU-compatible BIOS.
     * NOTE(review): "QEMU" is a string literal; deinit frees entry data —
     * confirm this is safe in the full source. */
    fw_cfg_add_bytes(cfg_state, FW_CFG_SIGNATURE, (uint8_t *)"QEMU", 4);
    //fw_cfg_add_bytes(cfg_state, FW_CFG_UUID, qemu_uuid, 16);
    fw_cfg_add_i16(cfg_state, FW_CFG_NOGRAPHIC, /*(uint16_t)(display_type == DT_NOGRAPHIC)*/ 0);
    fw_cfg_add_i16(cfg_state, FW_CFG_NB_CPUS, (uint16_t)vm->num_cores);
    fw_cfg_add_i16(cfg_state, FW_CFG_MAX_CPUS, (uint16_t)vm->num_cores);
    fw_cfg_add_i16(cfg_state, FW_CFG_BOOT_MENU, (uint16_t)1);
    //fw_cfg_bootsplash(cfg_state);

    fw_cfg_add_i32(cfg_state, FW_CFG_ID, 1);
    fw_cfg_add_i64(cfg_state, FW_CFG_RAM_SIZE, (uint64_t)vm->mem_size / (1024 * 1024));

    //fw_cfg_add_bytes(cfg_state, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,

    fw_cfg_add_i32(cfg_state, FW_CFG_IRQ0_OVERRIDE, 1);

    smbios_table = smbios_get_table(&smbios_len);

    fw_cfg_add_bytes(cfg_state, FW_CFG_SMBIOS_ENTRIES,
                     smbios_table, smbios_len);

    fw_cfg_add_bytes(cfg_state, FW_CFG_E820_TABLE, (uint8_t *)e820,
                     sizeof(struct e820_table));

    fw_cfg_add_bytes(cfg_state, FW_CFG_HPET, (uint8_t *)&hpet_cfg,
                     sizeof(struct hpet_fw_config));

    /* Optional NUMA description, driven by the <mem_layout> XML subtree. */
    v3_cfg_tree_t * layout_cfg = v3_cfg_subtree(vm->cfg_data->cfg, "mem_layout");
    char * num_nodes_str = v3_cfg_val(layout_cfg, "vnodes");

        /* locations in fw_cfg NUMA array for each info region. */
        /* layout: [0]=node count, [1..num_cores]=core->node, then per-node sizes */
        int mem_offset = 1 + vm->num_cores;

        num_nodes = atoi(num_nodes_str);

            uint64_t * numa_fw_cfg = NULL;

            // Allocate the global NUMA configuration array
            numa_fw_cfg = V3_Malloc((1 + vm->num_cores + num_nodes) * sizeof(uint64_t));

            if (numa_fw_cfg == NULL) {
                PrintError(vm, VCORE_NONE, "Could not allocate fw_cfg NUMA config space\n");
                v3_fw_cfg_deinit(vm);

            memset(numa_fw_cfg, 0, (1 + vm->num_cores + num_nodes) * sizeof(uint64_t));

            // First 8 bytes is the number of NUMA zones
            numa_fw_cfg[node_offset] = num_nodes;

            // Next region is array of core->node mappings
            for (i = 0; i < vm->num_cores; i++) {
                char * vnode_str = v3_cfg_val(vm->cores[i].core_cfg_data, "vnode");

                if (vnode_str == NULL) {
                    // if no cpu was specified then NUMA layout is randomized, and we're screwed...
                    numa_fw_cfg[core_offset + i] = 0;
                    numa_fw_cfg[core_offset + i] = (uint64_t)atoi(vnode_str);

            /* Final region is an array of node->mem_size mappings
             * this assumes that memory is assigned to NUMA nodes in consecutive AND contiguous blocks
             * NO INTERLEAVING ALLOWED
             * e.g. node 0 points to the first x bytes of memory, node 1 points to the next y bytes, etc
             * The array only stores the x,y,... values, indexed by the node ID
             * We should probably fix this, but that will require modifications to SEABIOS
             *
             * For now we will assume that the xml data is set accordingly, so we will just walk through the mem regions specified there.
             * NOTE: This will overwrite configurations if multiple xml regions are defined for each node
             */
            v3_cfg_tree_t * region_desc = v3_cfg_subtree(layout_cfg, "region");

            while (region_desc) {
                char * start_addr_str = v3_cfg_val(region_desc, "start_addr");
                char * end_addr_str = v3_cfg_val(region_desc, "end_addr");
                char * vnode_id_str = v3_cfg_val(region_desc, "vnode");

                addr_t start_addr = 0;

                if ((!start_addr_str) || (!end_addr_str) || (!vnode_id_str)) {
                    PrintError(vm, VCORE_NONE, "Invalid memory layout in configuration\n");
                    v3_fw_cfg_deinit(vm);

                start_addr = atox(start_addr_str);
                end_addr = atox(end_addr_str);
                vnode_id = atoi(vnode_id_str);

                /* record the region's size under its node (overwrites prior regions) */
                numa_fw_cfg[mem_offset + vnode_id] = end_addr - start_addr;

                region_desc = v3_cfg_next_branch(region_desc);

            /* Print the NUMA mapping being passed in */
            uint64_t region_start = 0;

            V3_Print(vm, VCORE_NONE, "NUMA CONFIG: (nodes=%llu)\n", numa_fw_cfg[0]);

            for (i = 0; i < vm->num_cores; i++) {
                V3_Print(vm, VCORE_NONE, "\tCore %d -> Node %llu\n", i, numa_fw_cfg[core_offset + i]);

            for (i = 0; i < num_nodes; i++) {
                V3_Print(vm, VCORE_NONE, "\tMem (%p - %p) -> Node %d\n", (void *)region_start,
                         (void *)numa_fw_cfg[mem_offset + i], i);

                region_start += numa_fw_cfg[mem_offset + i];

            // Register the NUMA cfg array with the FW_CFG interface
            fw_cfg_add_bytes(cfg_state, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
                             (1 + vm->num_cores + num_nodes) * sizeof(uint64_t));
/* NOTE(review): the fragment below references v3_fw_cfg_init locals (vm,
 * e820) and describes the legacy bochs-BIOS E820 handoff; it appears after
 * the SEABIOS path and is likely disabled/commented out in the full source —
 * confirm before touching. */
/* E820 code for HVM enabled bochs bios: */

/* E820 location in HVM virtual address space. Taken from VMXASSIST. */
#define HVM_E820_PAGE 0x00090000
#define HVM_E820_NR_OFFSET 0x000001E8
#define HVM_E820_OFFSET 0x000002D0

// Copy E820 to BIOS. See rombios.c, copy_e820_table function.
addr_t e820_ptr = (addr_t)V3_VAddr((void *)(vm->mem_map.base_region.host_addr + HVM_E820_PAGE));

*(uint16_t *)(e820_ptr + HVM_E820_NR_OFFSET) = e820->count;
memcpy((void *)(e820_ptr + HVM_E820_OFFSET), &e820->entry[0], sizeof(e820->entry[0]) * e820->count);