2 * This file is part of the Palacios Virtual Machine Monitor developed
3 * by the V3VEE Project with funding from the United States National
4 * Science Foundation and the Department of Energy.
6 * The V3VEE Project is a joint project between Northwestern University
7 * and the University of New Mexico. You can find out more at
10 * Copyright (c) 2008, Jack Lange <jarusl@cs.northwestern.edu>
11 * Copyright (c) 2008, The V3VEE Project <http://www.v3vee.org>
12 * All rights reserved.
14 * Author: Jack Lange <jarusl@cs.northwestern.edu>
16 * This is free software. You are permitted to use,
17 * redistribute, and modify it as specified in the file "V3VEE_LICENSE".
20 #include <palacios/vmm.h>
21 #include <palacios/vmm_dev_mgr.h>
22 #include <palacios/vmm_swapbypass.h>
23 #include <palacios/vm_guest.h>
24 #include <palacios/vmm_hashtable.h>
27 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
28 #include <palacios/vmm_telemetry.h>
33 #define PrintDebug(fmt, ...)
36 /* This is the first page that Linux writes to the swap area */
37 /* Taken from Linux */
/*
 * Members of union swap_header. This file keeps a copy of it in
 * swap_state.hdr, filled by buf_write() from the page written at offset 0.
 *
 * NOTE(review): the union/struct opening and closing lines and several
 * members (embedded lines 38-39, 42-43, 45-47, 52 onwards) are absent from
 * this extract, so how the members below are grouped cannot be confirmed
 * from here. buf_write() accesses them as hdr.magic.magic and hdr.info.type.
 */
40 char reserved[PAGE_SIZE - 10];
41 char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */
44 char bootbits[1024]; /* Space for disklabel etc. */
48 unsigned char sws_uuid[16];
49 unsigned char sws_volume[16];
50 uint32_t type; // The index into the swap_map; passed to v3_register_swap_disk() and v3_swap_in_notify() in this file
51 uint32_t padding[116];
/*
 * List linkage of a struct cache_entry. Entries start on swap_state.free_list
 * (connect_fn), move to swap_state.entry_list when a page is cached
 * (buf_write) and back to free_list when flushed (flush_cache).
 * NOTE(review): the struct header and its other member(s), e.g. the
 * disk_index field used elsewhere in this file, are not visible in this extract.
 */
60 struct list_head cache_node;
63 // Per instance data structure
// NOTE(review): only part of struct swap_state is visible in this extract; members
// used elsewhere in the file (active, disabled, symbiotic, cache, cache_size,
// seek_usecs, io_flag, private_data, the disk/page counters, ...) are on missing lines.
68 struct v3_vm_info * vm; // VM handle passed to v3_swap_in_notify() / v3_register_swap_disk()
69 struct swap_state * swap_info; // not referenced anywhere in the visible code
73 union swap_header hdr; // local copy of the swap header page (stored by buf_write)
76 uint_t unswapped_pages; // incremented by buf_read() by the number of pages read
83 struct v3_dev_blk_ops * ops; // block ops of the backing device; used by read_disk/write_disk/swap_get_capacity
86 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
95 uint64_t cache_base_addr; // value returned by V3_AllocPages() for the cache (see connect_fn)
96 uint_t pages_in_cache; // number of cache entries currently holding a page
98 struct cache_entry * entry_map; // array of cache_size / 4096 entries; an entry's array index selects its 4096-byte cache page
99 struct list_head entry_list; // entries in use; buf_write appends at the tail, flush_cache evicts from the front
100 struct list_head free_list; // entries not currently holding a page
102 struct hashtable * entry_ht; // maps a swap-area byte offset to its cache_entry
/*
 * Delay routine; declared here with no definition in the visible code.
 * read_disk() and write_disk() call it with swap->seek_usecs.
 */
107 void __udelay(unsigned long usecs);
109 static uint_t cache_hash_fn(addr_t key) {
110 return v3_hash_long(key, 32);
114 static int cache_eq_fn(addr_t key1, addr_t key2) {
115 return (key1 == key2);
/*
 * Convert a byte offset within the swap area into a 4096-byte page index.
 * Offsets that are not page aligned are rounded down.
 */
static inline uint32_t get_swap_index_from_offset(uint32_t offset) {
    // CAREFUL: The index might be offset by 1, because the first 4K is the header
    return (offset / 4096);
}
/*
 * Convert a 4096-byte page index into a byte offset within the swap area.
 *
 * The multiplication is done in uint32_t, so the result wraps for
 * index >= 2^20 (offsets of 4 GiB and above).
 */
static inline uint32_t get_swap_offset_from_index(uint32_t index) {
    // CAREFUL: The index might be offset by 1, because the first 4K is the header
    return (index * 4096);
}
133 static inline uint32_t get_cache_entry_index(struct swap_state * swap, struct cache_entry * entry) {
134 return (entry - swap->entry_map); // / sizeof(struct cache_entry);
/*
 * get_swap_entry - look up a swap page in the in-memory cache.
 * Installed as the .get_swap_entry member of swap_ops.
 *
 * pg_index:     page index in the swap area; converted to a byte offset
 *               (pg_index * 4096), the key form buf_read/buf_write use for
 *               entry_ht. The product is computed in uint32_t and wraps for
 *               pg_index >= 2^20.
 * private_data: the struct swap_state of this device.
 *
 * When an entry is used, pg_addr is set to the address of its 4096-byte
 * page inside swap->cache.
 *
 * NOTE(review): embedded lines 146, 148-150, 152, 154-155 and 159 onwards are
 * missing from this extract, so the body of the `disabled` guard, the test on
 * `entry`, and the return statement are not visible.
 * NOTE(review): the PrintDebug arguments reference `info`, which is not
 * declared in the visible code; this is harmless only while PrintDebug is
 * defined to expand to nothing, as it is near the top of this file.
 */
141 static inline void * get_swap_entry(uint32_t pg_index, void * private_data) {
142 struct swap_state * swap = (struct swap_state *)private_data;
143 struct cache_entry * entry = NULL;
144 void * pg_addr = NULL;
145 uint32_t swap_index = pg_index * 4096;
147 if (swap->disabled) {
151 PrintDebug(info->vm_info, info, "Getting swap entry for index %d\n", pg_index);
153 entry = (struct cache_entry *)v3_htable_search(swap->entry_ht, swap_index);
156 uint32_t cache_index = get_cache_entry_index(swap, entry);
157 PrintDebug(info->vm_info, info, "Found cached entry (%d)\n", cache_index);
158 pg_addr = swap->cache + (cache_index * 4096);
/*
 * read_disk - forward a read to the backing block device.
 *
 * When io_flag is 0 and seek_usecs is non-zero, __udelay(seek_usecs) is
 * called first. The disk_reads counter is then increased by num_bytes / 4096
 * and the result of swap->ops->read() is returned unchanged.
 *
 * NOTE(review): embedded lines 169-171 are missing from this extract, so the
 * rest of the delay branch (including any update of io_flag) is not visible.
 */
166 static int read_disk(uint8_t * buf, uint64_t lba, uint64_t num_bytes, struct swap_state * swap) {
167 if ((swap->io_flag == 0) && (swap->seek_usecs > 0)) {
168 __udelay(swap->seek_usecs);
172 swap->disk_reads += num_bytes / 4096;
173 return swap->ops->read(buf, lba, num_bytes, swap->private_data);
/*
 * write_disk - forward a write to the backing block device.
 *
 * When io_flag is 0 and seek_usecs is non-zero, __udelay(seek_usecs) is
 * called first. The disk_writes counter is then increased by num_bytes / 4096
 * and the result of swap->ops->write() is returned unchanged.
 *
 * NOTE(review): embedded lines 181-183 and 185-186 are missing from this
 * extract, so the rest of the delay branch (including any update of io_flag)
 * is not visible.
 */
178 static int write_disk(uint8_t * buf, uint64_t lba, uint64_t num_bytes, struct swap_state * swap) {
179 if ((swap->io_flag == 0) && (swap->seek_usecs > 0)) {
180 __udelay(swap->seek_usecs);
184 swap->disk_writes += num_bytes / 4096;
187 return swap->ops->write(buf, lba, num_bytes, swap->private_data);
191 static uint64_t swap_get_capacity(void * private_data) {
192 struct swap_state * swap = (struct swap_state *)private_data;
193 return swap->ops->get_capacity(swap->private_data);
/*
 * Swap callbacks handed to v3_register_swap_disk() from buf_write() once the
 * swap header page has been seen.
 * NOTE(review): the closing line of this initializer is not visible in this extract.
 */
197 static struct v3_swap_ops swap_ops = {
198 .get_swap_entry = get_swap_entry,
/*
 * buf_read - page-granular read path (called from swap_read with a
 * 4096-byte, page-aligned request).
 *
 * buf:          destination buffer
 * lba:          used as a byte offset: it is compared against 4096 and passed
 *               unchanged to read_disk()
 * num_bytes:    request length; truncated into the 32-bit `length`
 * private_data: the struct swap_state of this device
 *
 * Visible behaviour:
 *  - if swap->disabled, the request goes straight to read_disk();
 *  - if swap->active == 1 and offset >= 4096 (past the header page), the
 *    unswapped_pages counter (and pages_in under telemetry) is increased and
 *    each 4096-byte page is looked up in entry_ht. Judging by the debug
 *    messages, a hit is copied out of the cache and a miss is read from disk;
 *  - line 276 passes the whole request to read_disk(), presumably the
 *    fall-through for requests not handled by the cached path.
 *
 * NOTE(review): many short lines (conditions, `else`, returns, closing braces,
 * #endif, the declaration of `i`) are missing from this extract, so exact
 * control flow and return values cannot be confirmed from here.
 * NOTE(review): the "not a page multiple" error appears twice (lines 211 and
 * 223); one of the checks looks redundant.
 * NOTE(review): line 248 passes `offset + i` to get_swap_index_from_offset(),
 * while the cache key on line 243 is `offset + (i * 4096)`. Unless `offset`
 * is advanced on a missing line, every iteration notifies the same page
 * index. Confirm which is intended.
 * NOTE(review): line 266 reads from `offset` rather than `swap_index`, and no
 * advance of `buf` is visible; confirm both are stepped per page.
 */
203 static int buf_read(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) {
204 struct swap_state * swap = (struct swap_state *)private_data;
205 uint32_t offset = lba;
206 uint32_t length = num_bytes;
211 PrintError(VM_NONE, VCORE_NONE, "Swapping in length that is not a page multiple\n");
214 if (swap->disabled) {
215 return read_disk(buf, lba, num_bytes, swap);
219 PrintDebug(VM_NONE, VCORE_NONE, "SymSwap: Reading %d bytes to %p (lba=%p)\n", (uint32_t)num_bytes, buf, (void *)(addr_t)lba);
223 PrintError(VM_NONE, VCORE_NONE, "Swapping in length that is not a page multiple\n");
228 if ((swap->active == 1) && (offset >= 4096)) {
230 int read_pages = (length / 4096);
233 // Notify the shadow paging layer
235 swap->unswapped_pages += (length / 4096);
238 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
239 swap->pages_in += length / 4096;
242 for (i = 0; i < read_pages; i++) {
243 uint32_t swap_index = offset + (i * 4096);
244 uint32_t cache_index = 0;
245 struct cache_entry * entry = NULL;
247 if (swap->symbiotic == 1) {
248 v3_swap_in_notify(swap->vm, get_swap_index_from_offset(offset + i), swap->hdr.info.type);
251 PrintDebug(VM_NONE, VCORE_NONE, "Searching for swap index %d\n", swap_index);
253 entry = (struct cache_entry *)v3_htable_search(swap->entry_ht, (addr_t)swap_index);
257 cache_index = get_cache_entry_index(swap, entry);
259 PrintDebug(VM_NONE, VCORE_NONE, "Reading from cache entry %d\n", cache_index);
261 memcpy(buf, swap->cache + (cache_index * 4096), 4096);
264 PrintDebug(VM_NONE, VCORE_NONE, "Reading from disk offset = %p\n", (void *)(addr_t)offset);
266 if (read_disk(buf, offset, 4096, swap) == -1) {
267 PrintError(VM_NONE, VCORE_NONE, "Error reading disk\n");
276 return read_disk(buf, lba, num_bytes, swap);
/*
 * flush_cache - evict up to num_to_flush pages from the cache to disk.
 *
 * For each page: take the first entry on swap->entry_list, write its
 * 4096-byte cache page to disk at entry->disk_index, notify the guest through
 * v3_swap_in_notify() when swap->symbiotic == 1, remove the entry from
 * entry_ht, move it to free_list and decrement pages_in_cache.
 *
 * NOTE(review): short lines (declaration of `i`, error returns, closing
 * braces, the final return) are missing from this extract, so the return
 * value contract is not visible.
 * NOTE(review): no visible check that entry_list is non-empty before
 * list_first_entry(); confirm callers never request more pages than are cached.
 * NOTE(review): eviction uses v3_swap_in_notify(); confirm this is the
 * intended notification for a page leaving the cache.
 */
284 static int flush_cache(struct swap_state * swap, int num_to_flush) {
287 PrintDebug(VM_NONE, VCORE_NONE, "Flushing %d pages\n", num_to_flush);
289 for (i = 0; i < num_to_flush; i++) {
290 struct cache_entry * entry = NULL;
291 uint32_t entry_index = 0;
293 entry = list_first_entry(&(swap->entry_list), struct cache_entry, cache_node);
295 entry_index = get_cache_entry_index(swap, entry);
296 PrintDebug(VM_NONE, VCORE_NONE, "Flushing cache entry %d\n", entry_index);
298 if (write_disk(swap->cache + (entry_index * 4096), entry->disk_index, 4096, swap) == -1) {
299 PrintError(VM_NONE, VCORE_NONE, "Error in disk write\n");
304 if (swap->symbiotic == 1) {
305 v3_swap_in_notify(swap->vm, entry->disk_index / 4096, swap->hdr.info.type);
308 // invalidate swap entry
311 v3_htable_remove(swap->entry_ht, entry->disk_index, 0);
313 list_move(&(entry->cache_node), &(swap->free_list));
315 swap->pages_in_cache--;
/*
 * buf_write - page-granular write path (called from swap_write once a full
 * 4096-byte page has been assembled).
 *
 * buf:          source buffer
 * lba:          used as a byte offset (compared against 0 and 4096)
 * num_bytes:    request length; truncated into the 32-bit `length`
 * private_data: the struct swap_state of this device
 *
 * Visible behaviour:
 *  - swap->disabled: the request is passed straight to write_disk();
 *  - swap->active == 0 and offset == 0: the buffer is treated as the swap
 *    header page. A copy is kept in swap->hdr, the device is registered with
 *    v3_register_swap_disk() when swap->symbiotic == 1, and the header is
 *    written through to disk;
 *  - swap->active == 1 and offset >= 4096: pages are stored in the cache. If
 *    fewer free cache pages remain than pages being written, flush_cache() is
 *    asked to evict the difference first. Each page reuses its existing cache
 *    entry when entry_ht already has one for that offset; otherwise an entry
 *    is taken from the tail of free_list, keyed by the offset, appended to
 *    entry_list and inserted into entry_ht;
 *  - line 427 writes the request to disk via write_disk(); the condition
 *    guarding it is not visible.
 *
 * NOTE(review): many short lines (returns, `else`, closing braces, #endif,
 * the declaration of `i`, any assignment to swap->active, any advance of
 * `buf`) are missing from this extract.
 * NOTE(review): the return value of flush_cache() on line 394 is ignored.
 * NOTE(review): the message on line 428 says "swap header" although line 427
 * does not appear to be the header path; it looks copy-pasted from line 370.
 * NOTE(review): lines 357 and 365 use PrintError for what read as
 * informational messages.
 */
328 static int buf_write(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) {
329 struct swap_state * swap = (struct swap_state *)private_data;
330 uint32_t offset = lba;
331 uint32_t length = num_bytes;
337 if (swap->disabled) {
338 return write_disk(buf, lba, num_bytes, swap);
343 PrintDebug(VM_NONE, VCORE_NONE, "SymSwap: Writing %d bytes to %p from %p\n", length,
344 (void *)(swap->swap_space + offset), buf);
348 if ((swap->active == 0) && (offset == 0)) {
349 // This is the swap header page
353 // store a local copy
354 memcpy(&(swap->hdr), buf, sizeof(union swap_header));
357 PrintError(VM_NONE, VCORE_NONE, "Swap Type=%d (magic=%s)\n", swap->hdr.info.type, swap->hdr.magic.magic);
359 if (swap->symbiotic == 1) {
360 if (v3_register_swap_disk(swap->vm, swap->hdr.info.type, &swap_ops, swap) == -1) {
361 PrintError(VM_NONE, VCORE_NONE, "Error registering symbiotic swap disk\n");
365 PrintError(VM_NONE, VCORE_NONE, "Swap disk registered\n");
369 if (write_disk(buf, lba, num_bytes, swap) == -1) {
370 PrintError(VM_NONE, VCORE_NONE, "Error writing swap header to disk\n");
374 PrintDebug(VM_NONE, VCORE_NONE, "Wrote header to disk\n");
379 if ((swap->active == 1) && (offset >= 4096)) {
381 int written_pages = (length / 4096);
382 int avail_space = (swap->cache_size / 4096) - swap->pages_in_cache;
385 swap->swapped_pages += written_pages;
387 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
388 swap->pages_out += length / 4096;
391 PrintDebug(VM_NONE, VCORE_NONE, "available cache space = %d, pages written = %d\n", avail_space, written_pages);
393 if (avail_space < written_pages) {
394 flush_cache(swap, written_pages - avail_space);
398 for (i = 0; i < written_pages; i += 1) {
399 // set_index_usage(swap, get_swap_index_from_offset(offset + i), 1);
400 struct cache_entry * new_entry = NULL;
401 uint32_t swap_index = offset + (i * 4096);
402 uint32_t cache_index = 0;
404 new_entry = (struct cache_entry *)v3_htable_search(swap->entry_ht, (addr_t)swap_index);
406 if (new_entry == NULL) {
407 new_entry = list_tail_entry(&(swap->free_list), struct cache_entry, cache_node);
409 new_entry->disk_index = swap_index;
411 list_move_tail(&(new_entry->cache_node), &(swap->entry_list));
413 v3_htable_insert(swap->entry_ht, (addr_t)swap_index, (addr_t)new_entry);
415 swap->pages_in_cache++;
418 cache_index = get_cache_entry_index(swap, new_entry);
420 PrintDebug(VM_NONE, VCORE_NONE, "Writing to cache entry %d\n", cache_index);
422 memcpy(swap->cache + (cache_index * 4096), buf, 4096);
427 if (write_disk(buf, lba, num_bytes, swap) == -1) {
428 PrintError(VM_NONE, VCORE_NONE, "Error writing swap header to disk\n");
/* Staging page used by swap_write(); a single file-scope buffer shared by every call. */
439 static uint8_t write_buf[4096];
/*
 * swap_write - 512-byte write entry point.
 *
 * lba is treated as a byte offset. Each 512-byte chunk is copied into
 * write_buf at its position within the enclosing 4096-byte page
 * (lba % 4096). When the chunk fills the page up to its end
 * (idx + num_bytes == 4096), the whole page is passed to buf_write() at the
 * page-aligned offset.
 *
 * Requests whose size is not 512 bytes are reported with PrintError.
 *
 * NOTE(review): embedded lines 447-450 and 455 onwards are missing from this
 * extract, so the return values for the error and "page not yet complete"
 * cases are not visible.
 */
442 static int swap_write(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) {
443 int idx = lba % 4096;
445 if (num_bytes != 512) {
446 PrintError(VM_NONE, VCORE_NONE, "Write for %d bytes\n", (uint32_t)num_bytes);
451 memcpy(write_buf + idx, buf, num_bytes);
453 if (idx + num_bytes == 4096) {
454 return buf_write(write_buf, lba - idx, 4096, private_data);
/* Staging page used by swap_read(); a single file-scope buffer shared by every call. */
462 static uint8_t read_buf[4096];
/*
 * swap_read - 512-byte read entry point.
 *
 * lba is treated as a byte offset. The enclosing 4096-byte page is read into
 * read_buf through buf_read(), then the requested chunk is copied out from
 * its position within that page (lba % 4096).
 *
 * Requests whose size is not 512 bytes are reported with PrintError.
 *
 * NOTE(review): embedded lines 472-475, 478-481 and 483 onwards are missing
 * from this extract, so the return statements are not visible.
 */
466 static int swap_read(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) {
467 int idx = lba % 4096;
470 if (num_bytes != 512) {
471 PrintError(VM_NONE, VCORE_NONE, "Read for %d bytes\n", (uint32_t)num_bytes);
476 if (buf_read(read_buf, lba - idx, 4096, private_data) == -1) {
477 PrintError(VM_NONE, VCORE_NONE, "Error reading buffer\n");
482 memcpy(buf, read_buf + idx, num_bytes);
/*
 * swap_free - presumably the device free callback installed in dev_ops (TODO confirm).
 * NOTE(review): the body of this function is not visible in this extract.
 */
488 static int swap_free(void * dev) {
/*
 * Block operations this filter exposes to its frontend; passed to
 * v3_dev_connect_blk() in connect_fn.
 * NOTE(review): embedded lines 494-495 and the closing line are missing from
 * this extract, so only the get_capacity member is visible.
 */
493 static struct v3_dev_blk_ops blk_ops = {
496 .get_capacity = swap_get_capacity,
/*
 * Device operations passed to v3_allocate_device() in swap_init.
 * NOTE(review): the members of this initializer are not visible in this extract.
 */
501 static struct v3_device_ops dev_ops = {
506 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
507 static void telemetry_cb(struct v3_vm_info * vm, void * private_data, char * hdr) {
508 struct swap_state * swap = (struct swap_state *)private_data;
510 V3_Print(vm, VCORE_NONE, "%sSwap Device:\n", hdr);
511 V3_Print(vm, VCORE_NONE, "%s\tPages Swapped in=%d\n", hdr, swap->pages_in);
512 V3_Print(vm, VCORE_NONE, "%s\tPages Swapped out=%d\n", hdr, swap->pages_out);
513 V3_Print(vm, VCORE_NONE, "%s\tPages Written to Disk=%d\n", hdr, swap->disk_writes);
514 V3_Print(vm, VCORE_NONE, "%s\tPages Read from Disk=%d\n", hdr, swap->disk_reads);
/*
 * connect_fn - block-frontend connect callback (registered by swap_init
 * through v3_dev_add_blk_frontend).
 *
 * Allocates and initialises a struct swap_state, builds the cache bookkeeping
 * (entry_map array with every entry on free_list, an empty entry_list, the
 * entry_ht hashtable), allocates and zeroes the cache pages, and connects this
 * filter to the frontend named by the "tag" value of the "frontend" subtree
 * via v3_dev_connect_blk(), passing blk_ops and the new swap_state as private
 * data. With telemetry compiled in and enabled on the VM, telemetry_cb is
 * registered as well.
 *
 * Configuration values read from cfg:
 *   "cache_size" - multiplied by 1024 * 1024, i.e. given in MB
 *   "seek_us"    - stored in swap->seek_usecs
 *   "symbiotic"  - stored in swap->symbiotic
 *
 * NOTE(review): many short lines are missing from this extract, including the
 * `cfg` parameter declaration (embedded line 522), the NULL checks that guard
 * the error messages, every error-path body and return, the handling of
 * cache_size == 0, and the declaration of `i`.
 * NOTE(review): atoi() is applied directly to the v3_cfg_val() results;
 * confirm v3_cfg_val() cannot return NULL for an absent key.
 * NOTE(review): the cache_size product is computed in int before conversion
 * to uint32_t, so configured sizes of 2048 MB or more overflow.
 * NOTE(review): release of earlier allocations (swap, entry_map, hashtable,
 * cache pages) on the later failure paths is not visible here; confirm.
 * NOTE(review): line 536 uses PrintError for an informational message.
 */
519 static int connect_fn(struct v3_vm_info * vm,
520 void * frontend_data,
521 struct v3_dev_blk_ops * ops,
523 void * private_data) {
524 v3_cfg_tree_t * frontend_cfg = v3_cfg_subtree(cfg, "frontend");
525 uint32_t cache_size = atoi(v3_cfg_val(cfg, "cache_size")) * 1024 * 1024;
526 uint32_t seek_us = atoi(v3_cfg_val(cfg, "seek_us"));
527 int symbiotic = atoi(v3_cfg_val(cfg, "symbiotic"));
528 struct swap_state * swap = NULL;
532 PrintError(vm, VCORE_NONE, "Initializing sym swap without a frontend device\n");
536 PrintError(vm, VCORE_NONE, "Creating Swap filter (cache size=%dMB)\n", cache_size / (1024 * 1024));
538 swap = (struct swap_state *)V3_Malloc(sizeof(struct swap_state));
541 PrintError(vm, VCORE_NONE, "Cannot allocate in connect\n");
546 swap->cache_size = cache_size;
548 swap->seek_usecs = seek_us;
549 swap->symbiotic = symbiotic;
552 swap->private_data = private_data;
554 swap->swapped_pages = 0;
555 swap->unswapped_pages = 0;
556 // swap->cached_pages = 0;
558 if (cache_size == 0) {
563 INIT_LIST_HEAD(&(swap->entry_list));
564 INIT_LIST_HEAD(&(swap->free_list));
565 swap->entry_map = (struct cache_entry *)V3_Malloc(sizeof(struct cache_entry) * (cache_size / 4096));
567 if (!swap->entry_map) {
568 PrintError(vm, VCORE_NONE, "Cannot allocate in connect\n");
573 for (i = 0; i < (cache_size / 4096); i++) {
574 list_add(&(swap->entry_map[i].cache_node), &(swap->free_list));
577 swap->entry_ht = v3_create_htable(0, cache_hash_fn, cache_eq_fn);
581 swap->cache_base_addr = (addr_t)V3_AllocPages(swap->cache_size / 4096);
583 if (!swap->cache_base_addr) {
584 PrintError(vm, VCORE_NONE, "Cannot allocate cache space\n");
590 swap->cache = (uint8_t *)V3_VAddr((void *)(addr_t)(swap->cache_base_addr));
591 memset(swap->cache, 0, swap->cache_size);
594 if (v3_dev_connect_blk(vm, v3_cfg_val(frontend_cfg, "tag"),
595 &blk_ops, frontend_cfg, swap) == -1) {
596 PrintError(vm, VCORE_NONE, "Could not connect to frontend %s\n",
597 v3_cfg_val(frontend_cfg, "tag"));
602 #ifdef V3_CONFIG_SWAPBYPASS_TELEMETRY
604 if (vm->enable_telemetry == 1) {
605 v3_add_telemetry_cb(vm, telemetry_cb, swap);
/*
 * swap_init - device initialisation entry point (registered below under the
 * name "SWAPBYPASS_CACHE2").
 *
 * Allocates a device named by the "ID" config value with dev_ops, attaches it
 * to the VM, and registers connect_fn as this device's block-frontend
 * connect callback.
 *
 * NOTE(review): embedded lines 624-626 and 629-635 are missing from this
 * extract, so the error-path bodies and return values are not visible.
 * NOTE(review): no visible check of the v3_allocate_device() result before it
 * is passed to v3_attach_device(); confirm it cannot be NULL.
 */
616 static int swap_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) {
618 char * dev_id = v3_cfg_val(cfg, "ID");
620 struct vm_device * dev = v3_allocate_device(dev_id, &dev_ops, NULL);
622 if (v3_attach_device(vm, dev) == -1) {
623 PrintError(vm, VCORE_NONE, "Could not attach device %s\n", dev_id);
627 if (v3_dev_add_blk_frontend(vm, dev_id, connect_fn, NULL) == -1) {
628 PrintError(vm, VCORE_NONE, "Could not register %s as block frontend\n", dev_id);
/* Registers swap_init as the initialiser for the "SWAPBYPASS_CACHE2" device type. */
636 device_register("SWAPBYPASS_CACHE2", swap_init)