2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
 * and the University of New Mexico. You can find out more at
 * http://www.v3vee.org
10 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
11 * All rights reserved.
13 * Author: Alexander Kudryavtsev <alexk@ispras.ru>
14 * Implementation of FW_CFG interface
15 * Author: Jack Lange <jacklange@cs.pitt.edu>
18 * This is free software. You are permitted to use,
19 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
22 #include <palacios/vmm_fw_cfg.h>
23 #include <palacios/vmm_mem.h>
24 #include <palacios/vmm.h>
25 #include <palacios/vm_guest.h>
/* fw_cfg I/O interface: the guest writes a 16-bit entry key to the control
 * port, then streams the selected entry's payload one byte at a time
 * through the data port. Same protocol as QEMU's fw_cfg device. */
#define FW_CFG_CTL_PORT 0x510
#define FW_CFG_DATA_PORT 0x511

/* Well-known fw_cfg entry keys (architecture-independent set) */
#define FW_CFG_SIGNATURE 0x00
#define FW_CFG_ID 0x01
#define FW_CFG_UUID 0x02
#define FW_CFG_RAM_SIZE 0x03
#define FW_CFG_NOGRAPHIC 0x04
#define FW_CFG_NB_CPUS 0x05
#define FW_CFG_MACHINE_ID 0x06
#define FW_CFG_KERNEL_ADDR 0x07
#define FW_CFG_KERNEL_SIZE 0x08
#define FW_CFG_KERNEL_CMDLINE 0x09
#define FW_CFG_INITRD_ADDR 0x0a
#define FW_CFG_INITRD_SIZE 0x0b
#define FW_CFG_BOOT_DEVICE 0x0c
#define FW_CFG_NUMA 0x0d
#define FW_CFG_BOOT_MENU 0x0e
#define FW_CFG_MAX_CPUS 0x0f
#define FW_CFG_KERNEL_ENTRY 0x10
#define FW_CFG_KERNEL_DATA 0x11
#define FW_CFG_INITRD_DATA 0x12
#define FW_CFG_CMDLINE_ADDR 0x13
#define FW_CFG_CMDLINE_SIZE 0x14
#define FW_CFG_CMDLINE_DATA 0x15
#define FW_CFG_SETUP_ADDR 0x16
#define FW_CFG_SETUP_SIZE 0x17
#define FW_CFG_SETUP_DATA 0x18
#define FW_CFG_FILE_DIR 0x19

/* Flag bits OR'd into entry keys */
#define FW_CFG_WRITE_CHANNEL 0x4000
#define FW_CFG_ARCH_LOCAL 0x8000
/* Mask extracting the entry index from a key.
 * Fully parenthesized so the expansion behaves as a single term
 * regardless of the expression it is embedded in. */
#define FW_CFG_ENTRY_MASK (~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL))

/* Architecture-local (x86) entry keys */
#define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
#define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
#define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
#define FW_CFG_E820_TABLE (FW_CFG_ARCH_LOCAL + 3)
#define FW_CFG_HPET (FW_CFG_ARCH_LOCAL + 4)

/* Sentinel meaning "no entry currently selected" */
#define FW_CFG_INVALID 0xffff
77 E820_TYPE_ACPI_RECL = 3,
78 E820_TYPE_ACPI_NVS = 4,
82 #define E820_MAX_COUNT 128
83 struct e820_entry_packed {
87 } __attribute__((packed));
91 struct e820_entry_packed entry[E820_MAX_COUNT];
92 } __attribute__((packed)) __attribute((__aligned__(4)));
96 static int fw_cfg_add_bytes(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint8_t * data, uint32_t len)
98 int arch = !!(key & FW_CFG_ARCH_LOCAL);
99 // JRL: Well this is demented... Its basically generating a 1 or 0 from a mask operation
101 key &= FW_CFG_ENTRY_MASK;
103 if (key >= FW_CFG_MAX_ENTRY) {
107 cfg_state->entries[arch][key].data = data;
108 cfg_state->entries[arch][key].len = len;
113 static int fw_cfg_add_i16(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint16_t value)
115 uint16_t * copy = NULL;
117 copy = V3_Malloc(sizeof(uint16_t));
119 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint16_t));
122 static int fw_cfg_add_i32(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint32_t value)
124 uint32_t * copy = NULL;
126 copy = V3_Malloc(sizeof(uint32_t));
128 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint32_t));
131 static int fw_cfg_add_i64(struct v3_fw_cfg_state * cfg_state, uint16_t key, uint64_t value)
133 uint64_t * copy = NULL;
135 copy = V3_Malloc(sizeof(uint64_t));
137 return fw_cfg_add_bytes(cfg_state, key, (uint8_t *)copy, sizeof(uint64_t));
140 static int fw_cfg_ctl_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
144 static int fw_cfg_ctl_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
145 V3_ASSERT(core->vm_info, core, length == 2);
147 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
148 uint16_t key = *(uint16_t *)src;
151 cfg_state->cur_offset = 0;
153 if ((key & FW_CFG_ENTRY_MASK) >= FW_CFG_MAX_ENTRY) {
154 cfg_state->cur_entry = FW_CFG_INVALID;
157 cfg_state->cur_entry = key;
165 static int fw_cfg_data_read(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
166 V3_ASSERT(core->vm_info, core, length == 1);
168 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
169 int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
170 struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];
173 if ( (cfg_state->cur_entry == FW_CFG_INVALID) ||
174 (cfg_entry->data == NULL) ||
175 (cfg_state->cur_offset >= cfg_entry->len)) {
179 ret = cfg_entry->data[cfg_state->cur_offset++];
182 *(uint8_t *)src = ret;
187 static int fw_cfg_data_write(struct guest_info * core, uint16_t port, void * src, uint_t length, void * priv_data) {
188 V3_ASSERT(core->vm_info, core, length == 1);
190 struct v3_fw_cfg_state * cfg_state = (struct v3_fw_cfg_state *)priv_data;
191 int arch = !!(cfg_state->cur_entry & FW_CFG_ARCH_LOCAL);
192 struct v3_fw_cfg_entry * cfg_entry = &cfg_state->entries[arch][cfg_state->cur_entry & FW_CFG_ENTRY_MASK];
194 if ( (cfg_state->cur_entry & FW_CFG_WRITE_CHANNEL) &&
195 (cfg_entry->callback != NULL) &&
196 (cfg_state->cur_offset < cfg_entry->len)) {
198 cfg_entry->data[cfg_state->cur_offset++] = *(uint8_t *)src;
200 if (cfg_state->cur_offset == cfg_entry->len) {
201 cfg_entry->callback(cfg_entry->callback_opaque, cfg_entry->data);
202 cfg_state->cur_offset = 0;
209 static struct e820_table * e820_populate(struct v3_vm_info * vm) {
210 struct v3_e820_entry * entry = NULL;
211 struct e820_table * e820 = NULL;
214 if (vm->mem_map.e820_count > E820_MAX_COUNT) {
215 PrintError(vm, VCORE_NONE,"Too much E820 table entries! (max is %d)\n", E820_MAX_COUNT);
219 e820 = V3_Malloc(sizeof(struct e820_table));
222 PrintError(vm, VCORE_NONE, "Out of memory!\n");
226 e820->count = vm->mem_map.e820_count;
228 list_for_each_entry(entry, &vm->mem_map.e820_list, list) {
229 e820->entry[i].addr = e->addr;
230 e820->entry[i].size = e->size;
231 e820->entry[i].type = e->type;
239 int v3_fw_cfg_init(struct v3_vm_info * vm) {
243 struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);
248 struct e820_table * e820 = e820_populate(vm);
251 PrintError(vm, VCORE_NONE, "Failed to populate E820 for FW interface!\n");
258 ret |= v3_hook_io_port(vm, FW_CFG_CTL_PORT, fw_cfg_ctl_read, &fw_cfg_ctl_write, cfg_state);
259 ret |= v3_hook_io_port(vm, FW_CFG_DATA_PORT, fw_cfg_data_read, &fw_cfg_data_write, cfg_state);
263 PrintError(vm, VCORE_NONE, "Failed to hook FW CFG ports!\n");
267 fw_cfg_add_bytes(cfg_state, FW_CFG_SIGNATURE, (uint8_t *)"QEMU", 4);
268 //fw_cfg_add_bytes(cfg_state, FW_CFG_UUID, qemu_uuid, 16);
269 fw_cfg_add_i16(cfg_state, FW_CFG_NOGRAPHIC, /*(uint16_t)(display_type == DT_NOGRAPHIC)*/ 0);
270 fw_cfg_add_i16(cfg_state, FW_CFG_NB_CPUS, (uint16_t)vm->num_cores);
271 fw_cfg_add_i16(cfg_state, FW_CFG_MAX_CPUS, (uint16_t)vm->num_cores);
272 fw_cfg_add_i16(cfg_state, FW_CFG_BOOT_MENU, (uint16_t)1);
273 //fw_cfg_bootsplash(cfg_state);
275 fw_cfg_add_i32(cfg_state, FW_CFG_ID, 1);
276 fw_cfg_add_i64(cfg_state, FW_CFG_RAM_SIZE, (uint64_t)vm->mem_size / (1024 * 1024));
278 //fw_cfg_add_bytes(cfg_state, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,
281 fw_cfg_add_i32(cfg_state, FW_CFG_IRQ0_OVERRIDE, 1);
284 smbios_table = smbios_get_table(&smbios_len);
287 fw_cfg_add_bytes(cfg_state, FW_CFG_SMBIOS_ENTRIES,
288 smbios_table, smbios_len);
291 fw_cfg_add_bytes(cfg_state, FW_CFG_E820_TABLE, (uint8_t *)e820,
292 sizeof(struct e820_table));
294 fw_cfg_add_bytes(cfg_state, FW_CFG_HPET, (uint8_t *)&hpet_cfg,
295 sizeof(struct hpet_fw_config));
302 v3_cfg_tree_t * layout_cfg = v3_cfg_subtree(vm->cfg_data->cfg, "mem_layout");
303 char * num_nodes_str = v3_cfg_val(layout_cfg, "vnodes");
306 /* locations in fw_cfg NUMA array for each info region. */
309 int mem_offset = 1 + vm->num_cores;
312 num_nodes = atoi(num_nodes_str);
316 uint64_t * numa_fw_cfg = NULL;
319 // Allocate the global NUMA configuration array
320 numa_fw_cfg = V3_Malloc((1 + vm->num_cores + num_nodes) * sizeof(uint64_t));
322 if (numa_fw_cfg == NULL) {
323 PrintError(vm, VCORE_NONE, "Could not allocate fw_cfg NUMA config space\n");
327 memset(numa_fw_cfg, 0, (1 + vm->num_cores + num_nodes) * sizeof(uint64_t));
329 // First 8 bytes is the number of NUMA zones
330 numa_fw_cfg[node_offset] = num_nodes;
333 // Next region is array of core->node mappings
334 for (i = 0; i < vm->num_cores; i++) {
335 char * vnode_str = v3_cfg_val(vm->cores[i].core_cfg_data, "vnode");
337 if (vnode_str == NULL) {
338 // if no cpu was specified then NUMA layout is randomized, and we're screwed...
339 numa_fw_cfg[core_offset + i] = 0;
341 numa_fw_cfg[core_offset + i] = (uint64_t)atoi(vnode_str);
347 /* Final region is an array of node->mem_size mappings
348 * this assumes that memory is assigned to NUMA nodes in consecutive AND contiguous blocks
349 * NO INTERLEAVING ALLOWED
350 * e.g. node 0 points to the first x bytes of memory, node 1 points to the next y bytes, etc
351 * The array only stores the x,y,... values, indexed by the node ID
352 * We should probably fix this, but that will require modifications to SEABIOS
355 * For now we will assume that the xml data is set accordingly, so we will just walk through the mem regions specified there.
356 * NOTE: This will overwrite configurations if multiple xml regions are defined for each node
360 v3_cfg_tree_t * region_desc = v3_cfg_subtree(layout_cfg, "region");
362 while (region_desc) {
363 char * start_addr_str = v3_cfg_val(region_desc, "start_addr");
364 char * end_addr_str = v3_cfg_val(region_desc, "end_addr");
365 char * vnode_id_str = v3_cfg_val(region_desc, "vnode");
367 addr_t start_addr = 0;
371 if ((!start_addr_str) || (!end_addr_str) || (!vnode_id_str)) {
372 PrintError(vm, VCORE_NONE, "Invalid memory layout in configuration\n");
373 V3_Free(numa_fw_cfg);
377 start_addr = atox(start_addr_str);
378 end_addr = atox(end_addr_str);
379 vnode_id = atoi(vnode_id_str);
381 numa_fw_cfg[mem_offset + vnode_id] = end_addr - start_addr;
383 region_desc = v3_cfg_next_branch(region_desc);
388 /* Print the NUMA mapping being passed in */
390 uint64_t region_start = 0;
392 V3_Print(vm, VCORE_NONE, "NUMA CONFIG: (nodes=%llu)\n", numa_fw_cfg[0]);
394 for (i = 0; i < vm->num_cores; i++) {
395 V3_Print(vm, VCORE_NONE, "\tCore %d -> Node %llu\n", i, numa_fw_cfg[core_offset + i]);
398 for (i = 0; i < num_nodes; i++) {
399 V3_Print(vm, VCORE_NONE, "\tMem (%p - %p) -> Node %d\n", (void *)region_start,
400 (void *)numa_fw_cfg[mem_offset + i], i);
402 region_start += numa_fw_cfg[mem_offset + i];
407 // Register the NUMA cfg array with the FW_CFG interface
408 fw_cfg_add_bytes(cfg_state, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
409 (1 + vm->num_cores + num_nodes) * sizeof(uint64_t));
418 void v3_fw_cfg_deinit(struct v3_vm_info *vm) {
419 struct v3_fw_cfg_state * cfg_state = &(vm->fw_cfg_state);
422 for (i = 0; i < 2; ++i) {
423 for (j = 0; j < FW_CFG_MAX_ENTRY; ++j) {
424 if (cfg_state->entries[i][j].data != NULL)
425 V3_Free(cfg_state->entries[i][j].data);
/* E820 code for HVM enabled bochs bios: */

/* NOTE(review): the fragment below appears to be a disabled legacy code
 * path (its enclosing guard -- likely an #if 0 -- and surrounding function
 * are not visible in this view). It references locals 'vm' and 'e820'
 * that are not in scope here; confirm against the full file before
 * enabling or removing it. */

/* E820 location in HVM virtual address space. Taken from VMXASSIST. */
#define HVM_E820_PAGE 0x00090000
#define HVM_E820_NR_OFFSET 0x000001E8
#define HVM_E820_OFFSET 0x000002D0
// Copy E820 to BIOS. See rombios.c, copy_e820_table function.
addr_t e820_ptr = (addr_t)V3_VAddr((void *)(vm->mem_map.base_region.host_addr + HVM_E820_PAGE));

// Entry count lives at a fixed offset in the BIOS data page...
*(uint16_t *)(e820_ptr + HVM_E820_NR_OFFSET) = e820->count;
// ...followed by the packed entry array at its own fixed offset.
memcpy((void *)(e820_ptr + HVM_E820_OFFSET), &e820->entry[0], sizeof(e820->entry[0]) * e820->count);