From: Yang Yang and Weixiao Fu Date: Wed, 1 Apr 2015 20:57:51 +0000 (-0500) Subject: QCOW2 block storage backend for Palacios X-Git-Url: http://v3vee.org/palacios/gitweb/gitweb.cgi?p=palacios.git;a=commitdiff_plain;h=a2c42ecf5c65f014a1a22b6a3fc3548482c4ec22 QCOW2 block storage backend for Palacios This commit adds a new backend, qcowdisk, that allows the use of QCOW2 format files as the storage for virtual hard drives, cds, virtio block devices, etc. - Backing stores are supported - Snapshots are not supported - Encryption is not supported --- diff --git a/palacios/src/devices/Kconfig b/palacios/src/devices/Kconfig index da55c11..2654818 100644 --- a/palacios/src/devices/Kconfig +++ b/palacios/src/devices/Kconfig @@ -249,6 +249,19 @@ config DEBUG_FILEDISK help Enable debugging for the file based disk backend +config QCOWDISK + bool "QCOWDISK storage backend" + default y + depends on FILE && (IDE || LINUX_VIRTIO_BLOCK) + help + Includes the QEMU QCOW2 file-based disk backend + +config DEBUG_QCOWDISK + bool "QCOWDISK backend debugging" + depends on QCOWDISK && DEBUG_ON + help + Enable debugging for the QEMU QCOW2-based disk backend + config NETDISK bool "NETDISK storage backend" default y diff --git a/palacios/src/devices/Makefile b/palacios/src/devices/Makefile index e6a822a..035bdf5 100644 --- a/palacios/src/devices/Makefile +++ b/palacios/src/devices/Makefile @@ -37,6 +37,7 @@ obj-$(V3_CONFIG_TMPDISK) += tmpdisk.o obj-$(V3_CONFIG_RAMDISK) += ramdisk.o obj-$(V3_CONFIG_NETDISK) += netdisk.o obj-$(V3_CONFIG_FILEDISK) += filedisk.o +obj-$(V3_CONFIG_QCOWDISK) += qcowdisk.o obj-$(V3_CONFIG_CGA) += cga.o obj-$(V3_CONFIG_TELNET_CONSOLE) += telnet_cons.o diff --git a/palacios/src/devices/qcowdisk.c b/palacios/src/devices/qcowdisk.c new file mode 100644 index 0000000..910e9bd --- /dev/null +++ b/palacios/src/devices/qcowdisk.c @@ -0,0 +1,1088 @@ +/* + * This file is part of the Palacios Virtual Machine Monitor developed + * by the V3VEE Project with funding from the United States National + * Science Foundation and the Department of Energy. + * + * The V3VEE Project is a joint project between Northwestern University + * and the University of New Mexico. You can find out more at + * http://www.v3vee.org + * + * Copyright (c) 2008, The V3VEE Project + * All rights reserved. + * + * Author: Yang Yang + * Weixiao Fu + * + * This is free software. You are permitted to use, + * redistribute, and modify it as specified in the file "V3VEE_LICENSE". + */ + +#include +#include + +#include +#include + +#ifndef V3_CONFIG_DEBUG_QCOWDISK +#undef PrintDebug +#define PrintDebug(fmt, args...) +#endif + +#define V3_PACKED __attribute__((packed)) +#define QCOW2_MAGIC (('Q'<<24) | ('F'<<16) | ('I'<<8) | (0xfb)) + +#define QCOW2_COPIED (1ULL<<63) +#define QCOW2_COMPRESSED (1ULL<<62) +#define INIT_BUFF_SIZE (512) + +#define ERROR(...) PrintError(VM_NONE,VCORE_NONE,"qcow2: " __VA_ARGS__) +#define DEBUG(...) PrintDebug(VM_NONE,VCORE_NONE,"qcow2: " __VA_ARGS__) +#define INFO(...) V3_Print(VM_NONE,VCORE_NONE,"qcow2: " __VA_ARGS__) + + +// the header structure for QCOW2 +typedef struct v3_qcow2_header { + uint32_t magic; + uint32_t version; + + uint64_t backing_file_offset; + uint32_t backing_file_size; + + uint32_t cluster_bits; + uint64_t size; + + uint32_t crypt_method; + + uint32_t l1_size; + uint64_t l1_table_offset; + + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + + uint32_t nb_snapshots; + uint64_t snapshots_offset; + +} V3_PACKED v3_qcow2_header_t; + +// the header structure for each QCOW2 snapshot +typedef struct v3_qcow2_snapshot_header { + uint64_t l1_table_offset; + uint32_t l1_size; + + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + uint32_t vm_state_size; + uint32_t extra_data_size; +} V3_PACKED v3_qcow2_snapshot_header_t; + +// the private structure used by QCOW2 implementation +typedef struct v3_qcow2 { + v3_file_t fd; + struct v3_qcow2 *backing_qcow2; + char *backing_file_name; + uint64_t cluster_size; + uint32_t l1_bits; + uint64_t l1_mask; + uint32_t l2_bits; + uint64_t l2_mask; + uint32_t refcount_block_bits; + uint64_t refcount_block_mask; + uint32_t refcount_table_bits; + uint64_t refcount_table_mask; + uint64_t free_cluster_index; + v3_qcow2_header_t header; +} v3_qcow2_t; + +typedef struct v3_qcow2_table_entry { + uint64_t offset: 62; + uint8_t compressed: 1; + uint8_t copied: 1; +} v3_qcow2_table_entry_t; + + +// our implementations for Big/Little Endian conversion +static inline uint16_t be16toh(uint16_t v) +{ + return ((v&0xff)<<8) | ((v&0xff00)>>8); +} + +static inline uint32_t be32toh(uint32_t v) +{ + return (((uint32_t)be16toh(v&0x0000ffffU))<<16) | (uint32_t)be16toh((v&0xffff0000U)>>16); +} + +static inline uint64_t be64toh(uint64_t v) +{ + return (((uint64_t)be32toh(v&0x00000000ffffffffU))<<32) | (uint64_t)be32toh((v&0xffffffff00000000U)>>32); +} + +static inline uint16_t htobe16(uint16_t v) +{ + return be16toh(v); +} + +static inline uint64_t htobe64(uint64_t v) +{ + return be64toh(v); +} + +uint64_t v3_qcow2_get_capacity(v3_qcow2_t *pf) +{ + return pf ? pf->header.size : 0; +} + +static inline uint64_t v3_qcow2_get_cluster_index(v3_qcow2_t *pf, uint64_t file_pos) +{ + if (!pf) { + return 0; + } + return file_pos >> pf->header.cluster_bits; +} + +static int v3_qcow2_get_refcount(v3_qcow2_t *pf, uint64_t idx) +{ + int res = -1, ret = 0; + uint16_t val = 0; + uint64_t table_idx = 0, block_idx = 0, block_offset = 0; + + if (!pf) { + return res; + } + + block_idx = idx & pf->refcount_block_mask; + idx >>= pf->refcount_block_bits; + table_idx = idx & pf->refcount_table_mask; + + ret = v3_file_read(pf->fd, (uint8_t*)&block_offset, sizeof(uint64_t), pf->header.refcount_table_offset + table_idx * sizeof(uint64_t)); + + // FIXME: how to deal with the wrong position + if (ret != sizeof(uint64_t)) { + ERROR("read failed\n"); + return 0; + } + + block_offset = be64toh(block_offset); + + // if cluster is not yet allocated, return 0 + if (!block_offset) { + return 0; + } + + ret = v3_file_read(pf->fd, (uint8_t*)&val, sizeof(uint16_t), block_offset + block_idx * sizeof(uint16_t)); + + if (ret != sizeof(uint16_t)) { + ERROR("read failed\n"); + return 0; + } + + val = be16toh(val); + + return val; +} + +/* + * this function is a wrapper of v3_qcow2_get_refcount + * takes file offset and returns the reference count + * it is commented to avoid compile warning + * + */ +__attribute__((unused)) +static int v3_qcow2_get_refcount_by_file_position(v3_qcow2_t *pf, uint64_t file_pos) +{ + int res = -1; + uint64_t idx = 0; + + if (!pf) { + return res; + } + idx = v3_qcow2_get_cluster_index(pf, file_pos); + + return v3_qcow2_get_refcount(pf, idx); +} + + +/* + * to allocate the contiguous clusters + * return the cluster index in the QCOW2 file + * return positive if successfully, otherwise zero(0) + */ +static uint64_t v3_qcow2_alloc_clusters(v3_qcow2_t *pf, uint32_t nb_clusters) +{ + uint32_t i; + int refcount = 0; + uint64_t idx = 0, ret_idx = 0; + + if(!nb_clusters) { + return 0; + } + + if(!pf) { + return 0; + } + + /* + * referenced the algorithm from Qemu + */ + retry: + ret_idx = pf->free_cluster_index; + for (i = 0; i < nb_clusters; i++) { + idx = pf->free_cluster_index++; + refcount = v3_qcow2_get_refcount(pf, idx); + if(refcount < 0) { + return 0; + } else if(refcount) { + goto retry; + } + } + return ret_idx; +} + +static int v3_qcow2_addr_split(v3_qcow2_t *qc2, uint64_t addr, uint64_t *l1_idx, uint64_t *l2_idx, uint64_t *offset) +{ + if (!qc2 || !l1_idx || !l2_idx || !offset) { + return -1; + } + + *offset = addr & (qc2->cluster_size - 1); + addr = addr >> qc2->header.cluster_bits; + *l2_idx = addr & qc2->l2_mask; + addr = addr >> qc2->l2_bits; + *l1_idx = addr * qc2->l1_mask; + + return 0; +} + +static v3_qcow2_t *v3_qcow2_open(struct v3_vm_info* vm, char *path, int flags) +{ + int ret = 0; + if(!path) { + return NULL; + } + + v3_qcow2_t *res = (v3_qcow2_t*)V3_Malloc(sizeof(v3_qcow2_t)); + + if (!res) { + ERROR("failed to allocate\n"); + goto failed; + } + + memset(res, 0, sizeof(v3_qcow2_t)); + + res->fd = v3_file_open(vm, path, flags); + + if (res->fd < 0) { + ERROR("failed to open underlying file\n"); + goto clean_mem; + } + + ret = v3_file_read(res->fd, (uint8_t*)&res->header, sizeof(res->header), 0); + + if (ret != sizeof(res->header)) { + ERROR("failed to read header\n"); + goto clean_mem; + } + + res->header.magic = be32toh(res->header.magic); + + if (res->header.magic != QCOW2_MAGIC) { + ERROR("wrong magic in header\n"); + goto clean_file; + } +#ifdef __DEBUG__ + else { + DEBUG("right magic\n"); + } +#endif + + res->header.version = be32toh(res->header.version); + + if (res->header.version < 2) { + ERROR("unsupported version: %d\n", res->header.version); + goto clean_file; + } +#ifdef __DEBUG__ + else { + DEBUG("supported version: %d\n", res->header.version); + } +#endif + + res->header.backing_file_offset = be64toh(res->header.backing_file_offset); + res->header.backing_file_size = be32toh(res->header.backing_file_size); + + if (res->header.backing_file_size) { +#ifdef __DEBUG__ + DEBUG("backing file size is larger than zero: %d\n", res->header.backing_file_size); +#endif + + res->backing_file_name = (char*)V3_Malloc(res->header.backing_file_size + 1); + + if (!res->backing_file_name) { + ERROR("failed to allocate memory for backing file name\n"); + goto clean_file; + } + + res->backing_file_name[res->header.backing_file_size] = 0; + + ret = v3_file_read(res->fd, (void*)res->backing_file_name, res->header.backing_file_size, res->header.backing_file_offset); + + if(ret != res->header.backing_file_size) { + ERROR("failed to read backing file name from %s\n", path); + V3_Free(res->backing_file_name); + goto clean_file; + } + + res->backing_qcow2 = v3_qcow2_open(vm, res->backing_file_name, flags); + + if(res->backing_qcow2) { + DEBUG("load backing file successfully\n"); + } else { + ERROR("failed to load backing file, exit\n"); + return NULL; + } + + DEBUG("successfully read the backing file name: %s\n", res->backing_file_name); + + } else { + // no backing file + + res->backing_qcow2 = NULL; + + DEBUG("read no backing file name since size == %d\n", res->header.backing_file_size); + + } + + res->header.cluster_bits = be32toh(res->header.cluster_bits); + res->cluster_size = 1 << res->header.cluster_bits; + res->l2_bits = res->header.cluster_bits - 3; + res->l2_mask = (((uint64_t)1)<l2_bits) - 1; + res->l1_bits = sizeof(uint64_t) * 8 - res->l2_bits - res->header.cluster_bits; + res->l1_mask = (((uint64_t)1)<l1_bits) - 1; + + DEBUG("cluster_bits: %d\n", res->header.cluster_bits); + + res->header.size = be64toh(res->header.size); + + DEBUG("size: %llu\n", res->header.size); + + res->header.crypt_method = be32toh(res->header.crypt_method); + + if (res->header.crypt_method) { + DEBUG("AES encryption\n"); + } else { + DEBUG("no encryption\n"); + } + + res->header.l1_size = be32toh(res->header.l1_size); + res->header.l1_table_offset = be64toh(res->header.l1_table_offset); + + res->header.refcount_table_offset = be64toh(res->header.refcount_table_offset); + res->header.refcount_table_clusters = be32toh(res->header.refcount_table_clusters); + + res->refcount_block_bits = res->header.cluster_bits - 1; + res->refcount_block_mask = (1LL<refcount_block_bits) - 1; + res->refcount_table_bits = 8 * sizeof(uint64_t) - res->refcount_block_bits; + res->refcount_table_mask = (1LL<refcount_table_bits) - 1; + + res->header.nb_snapshots = be32toh(res->header.nb_snapshots); + res->header.snapshots_offset = be64toh(res->header.snapshots_offset); + + + DEBUG("l1 size: %d\n", res->header.l1_size); + DEBUG("l1 table offset: %llu\n", res->header.l1_table_offset); + + DEBUG("refcount_table_offset: %llu\n", res->header.refcount_table_offset); + DEBUG("refcount_table_clusters: %d\n", res->header.refcount_table_clusters); + + DEBUG("nb_snapshots: %d\n", res->header.nb_snapshots); + DEBUG("snapshots_offset: %llu\n", res->header.snapshots_offset); + + res->free_cluster_index = 1; + + // TODO: initialize the free cluster index to a reasonable value + while (1) { + if (v3_qcow2_get_refcount(res, res->free_cluster_index)) { + res->free_cluster_index++; + } else { + break; + } + } + + + return res; + +clean_file: + v3_file_close(res->fd); +clean_mem: + V3_Free(res); +failed: + return NULL; +} + +static void v3_qcow2_close(v3_qcow2_t *pf) +{ + if(!pf) { + return; + } + + v3_file_close(pf->fd); + + if (pf->backing_file_name) { + V3_Free(pf->backing_file_name); + } + + if (pf->backing_qcow2) { + v3_qcow2_close(pf->backing_qcow2); + } + + V3_Free(pf); +} + +static uint64_t v3_qcow2_get_cluster_offset(v3_qcow2_t *qc, uint64_t l1_idx, uint64_t l2_idx, uint64_t offset) +{ + uint64_t res = 0; + uint64_t l1_val = 0, l2_val = 0; + v3_qcow2_table_entry_t *ent = NULL; + int ret = 0; + + if (!qc) { + goto done; + } + + if (l1_idx >= qc->header.l1_size) { + return 0ULL; + } + + ret = v3_file_read(qc->fd, (void*)&l1_val, sizeof(uint64_t), l1_idx * sizeof(uint64_t) + qc->header.l1_table_offset); + + if (ret != sizeof(uint64_t)) { + ERROR("Failed to read L1\n"); + goto done; + } + + l1_val = be64toh(l1_val); + ent = (v3_qcow2_table_entry_t*)&l1_val; + + if (!ent->offset) { + goto done; + } + + ret = v3_file_read(qc->fd, (void*)&l2_val, sizeof(uint64_t), l2_idx * sizeof(uint64_t) + ent->offset); + + if (ret != sizeof(uint64_t)) { + ERROR("Failed to read L2\n"); + goto done; + } + + l2_val = be64toh(l2_val); + ent = (v3_qcow2_table_entry_t*)&l2_val; + res = ent->offset; + +done: + return res; +} + + +static int v3_qcow2_read_cluster(v3_qcow2_t *pf, uint8_t *buff, uint64_t pos, int len) +{ + int ret = 0; + uint64_t l1_idx = 0, l2_idx = 0, offset = 0; + uint64_t file_offset = 0; + + if(!pf || !buff || !len) { + return -1; + } + + ret = v3_qcow2_addr_split(pf, pos, &l1_idx, &l2_idx, &offset); + + if (ret) { + ERROR("failed to split address\n"); + return -1; + } + + file_offset = v3_qcow2_get_cluster_offset(pf, l1_idx, l2_idx, offset); + + if (file_offset) { + + ret = v3_file_read(pf->fd, buff, len, file_offset + (pos & (pf->cluster_size - 1))); + + // it is possible to get a negative value because of the hole + if (ret < 0) { + return -1; + } + + } else if(pf->backing_qcow2) { + return v3_qcow2_read_cluster(pf->backing_qcow2, buff, pos, len); + } else { + memset(buff, 0, len); + } + + return 0; +} + +static int v3_qcow2_read(v3_qcow2_t *pf, uint8_t *buff, uint64_t pos, int len) +{ + if(!pf || !buff || !len) { + return -1; + } + + uint64_t next_addr, cur_len; + int ret = 0; + + while (len) { + next_addr = (pos + pf->cluster_size) & ~(pf->cluster_size - 1); + cur_len = next_addr - pos; + cur_len = cur_len < len ? cur_len : len; + //DEBUG("pos=%lu, len=%lu\n", pos, cur_len); + ret = v3_qcow2_read_cluster(pf, buff, pos, cur_len); + if (ret) { + return -1; + } + buff += cur_len; + pos += cur_len; + len -= cur_len; + } + + return 0; +} + +// in this function, we assmue we must have the corresponding refcount block +// so we will not allocate the refcount block here +static int v3_qcow2_update_refcount(v3_qcow2_t *pf, uint64_t cluster_idx, int count) +{ + uint64_t table_idx = 0, block_idx = 0, block_offset = 0, idx = cluster_idx; + int ret = 0; + uint16_t val = count; + + if (!pf) { + return -1; + } + + block_idx = idx & pf->refcount_block_mask; + idx >>= pf->refcount_block_bits; + table_idx = idx & pf->refcount_table_mask; + + ret = v3_file_read(pf->fd, (uint8_t*)&block_offset, sizeof(uint64_t), pf->header.refcount_table_offset + table_idx * sizeof(uint64_t)); + + if (ret != sizeof(uint64_t) || !block_offset) { + ERROR("something wrong with update refcount, exit\n"); + return -1; + } + + block_offset = be64toh(block_offset); + val = htobe16(val); + + ret = v3_file_write(pf->fd, (uint8_t*)&val, sizeof(uint16_t), block_offset + block_idx * sizeof(uint16_t)); + + if (ret != sizeof(uint16_t)) { + ERROR("write failed when update refcount, exit\n"); + return -1; + } + + return 0; +} + +// in this function, we need to resolve the circular dependency when the refcount itself is not allocated +// since for cluster_bits==16, it needs 65536G to use more than one cluster to contain all the refcount +// table, we don't handle that case +// of course, we can handle this case if we have enough time +static int v3_qcow2_alloc_refcount(v3_qcow2_t *pf, uint64_t cluster_idx) +{ + int res = -1, ret; + uint8_t zero_buff[INIT_BUFF_SIZE]; + uint16_t val = 0; + uint64_t idx = cluster_idx, table_idx = 0, block_idx = 0, block_offset = 0; + uint64_t new_cluster_idx, new_table_idx = 0, new_block_idx = 0, write_value; + uint64_t left_size, start_offset, buf_length; + + if (!pf) { + return -1; + } + + block_idx = idx & pf->refcount_block_mask; + idx >>= pf->refcount_block_bits; + table_idx = idx & pf->refcount_table_mask; + + // TODO: re-allocate larger refcount table if needed + +retry: + ret = v3_file_read(pf->fd, (uint8_t*)&block_offset, sizeof(uint64_t), pf->header.refcount_table_offset + table_idx * sizeof(uint64_t)); + + if (ret != sizeof(uint64_t) || table_idx > pf->header.refcount_table_clusters) { + ERROR("read failed, exit!\n"); + return -1; + } + + block_offset = be64toh(block_offset); + + if (!block_offset) { + // allocate a cluster as a new refcount block + // and also initialize this cluster with zeros + + new_cluster_idx = v3_qcow2_alloc_clusters(pf, 1); + + if (new_cluster_idx <= 0) { + ERROR("failed to allocate new cluster, exit!\n"); + return -1; + } + + idx = new_cluster_idx; + new_block_idx = idx & pf->refcount_block_mask; + idx >>= pf->refcount_block_bits; + new_table_idx = idx & pf->refcount_table_mask; + + // initialize with zeros + start_offset = new_cluster_idx << pf->header.cluster_bits; + left_size = pf->cluster_size; + memset(zero_buff, 0, INIT_BUFF_SIZE); + + while (left_size > 0) { + + buf_length = INIT_BUFF_SIZE < left_size ? INIT_BUFF_SIZE : left_size; + + ret = v3_file_write(pf->fd, zero_buff, buf_length, start_offset); + + if (ret != buf_length) { + ERROR("something wrong with write, exit\n"); + return -1; + } + + start_offset += buf_length; + + left_size -= buf_length; + + } + + // update the refcount table with the new refcount block + + write_value = htobe64(new_table_idx << pf->header.cluster_bits); + + ret = v3_file_write(pf->fd, (uint8_t*)&write_value, sizeof(uint64_t), pf->header.refcount_table_offset + table_idx * sizeof(uint64_t)); + + if (ret != sizeof(uint64_t) ) { + ERROR("write of data failed\n"); + return -1; + } + + if (new_table_idx == table_idx) { + // in the same refcount block, increase its refcount here + val = htobe16(1); + + ret = v3_file_write(pf->fd, (uint8_t*)&val, sizeof(uint16_t), (new_table_idx << pf->header.cluster_bits) + sizeof(uint16_t) * new_block_idx); + + if (ret != sizeof(uint16_t)) { + ERROR("write failed\n"); + return -1; + } + + } else { + v3_qcow2_alloc_refcount(pf, new_cluster_idx); + v3_qcow2_update_refcount(pf, new_cluster_idx, 1); + } + + goto retry; + } + + res = 0; + return res; +} + + + +static int v3_qcow2_increase_refcount(v3_qcow2_t *pf, uint64_t cluster_idx) +{ + int refcount = 0; + + if (!pf) { + return -1; + } + + refcount = v3_qcow2_get_refcount(pf, cluster_idx); + + if (refcount <= 0) { + // execute to here means that no cluster block entry is allocated + // we need to allocate the entry here + refcount = v3_qcow2_alloc_refcount(pf, cluster_idx); + + if (refcount) { + ERROR("something wrong when allocate refcount entry, exit!\n"); + return -1; + } + + refcount = 1; + + } else { + + refcount++; + + } + + // write the refcount back to the file + return v3_qcow2_update_refcount(pf, cluster_idx, refcount); +} + +/* + * this function is to decrease the reference count of a cluster + * since the snapshot is not implemented, + */ +__attribute__ ((unused)) +static int v3_qcow2_decrease_refcount(v3_qcow2_t *pf, uint64_t cluster_idx) +{ + int refcount = 0; + + if (!pf) { + return -1; + } + + refcount = v3_qcow2_get_refcount(pf, cluster_idx); + + if(refcount <= 0) { + ERROR("attempt to decrease the refcount for a cluster, exit\n"); + return -1; + } + + refcount--; + + return v3_qcow2_update_refcount(pf, cluster_idx, refcount); +} + + +/* + * do nothing but return if no need to allocate new cluster + * only allocate one cluster if necessary + */ +static uint64_t v3_qcow2_alloc_cluster_offset(v3_qcow2_t *pf, uint64_t pos) +{ + uint64_t res = 0, l1_idx = 0, l2_idx = 0, offset = 0, l2_cluster_offset, done_bytes; + uint64_t l2_cluster_idx; + uint64_t cluster_offset = 0; + int ret = 0; + uint8_t init_buff[INIT_BUFF_SIZE], *data_buff; + + if (!pf) { + return res; + } + + ret = v3_qcow2_addr_split(pf, pos, &l1_idx, &l2_idx, &offset); + + if (ret) { + ERROR("cannot split address\n"); + return res; + } + + // FIXME: in fact, we should check the refcount to be 1, + // otherwise we should copy + // do it later + + res = v3_qcow2_get_cluster_offset(pf, l1_idx, l2_idx, offset); + + if (res) { + cluster_offset = res; + goto done; + } + + /* + * need to allocate a new cluster for write + * also need to update the l1 and l2 table + */ + + ret = v3_file_read(pf->fd, (uint8_t*)&l2_cluster_offset, sizeof(uint64_t), pf->header.l1_table_offset + sizeof(uint64_t) * l1_idx); + + if (ret != sizeof(uint64_t)) { + ERROR("read failed\n"); + return res; + } + + l2_cluster_offset = be64toh(l2_cluster_offset) & ~(QCOW2_COPIED | QCOW2_COMPRESSED); + + if (!l2_cluster_offset/*l1_idx >= pf->header.l1_size*/) { // huh? + /* + * need to allocate a new l1 entry + * for simplicity, only allow 2^(cluster_bits-3) entry in l1 table + */ + + l2_cluster_idx = v3_qcow2_alloc_clusters(pf, 1); + l2_cluster_offset = (l2_cluster_idx << pf->header.cluster_bits); + + // increase the reference count for this cluster + v3_qcow2_increase_refcount(pf, l2_cluster_idx); + + memset(init_buff, 0, INIT_BUFF_SIZE); + + for (done_bytes = 0; done_bytes < pf->cluster_size; done_bytes += INIT_BUFF_SIZE) { + + ret = v3_file_write(pf->fd, init_buff, INIT_BUFF_SIZE, l2_cluster_offset + done_bytes); + + if (ret != INIT_BUFF_SIZE) { + ERROR("write failed\n"); + return res; + } + } + + /* + * set the copied bit + */ + l2_cluster_offset |= QCOW2_COPIED; + l2_cluster_offset = htobe64(l2_cluster_offset); + + ret = v3_file_write(pf->fd, (uint8_t*)&l2_cluster_offset, sizeof(uint64_t), pf->header.l1_table_offset + sizeof(uint64_t) * l1_idx); + + if (ret != sizeof(uint64_t)) { + ERROR("write failed\n"); + return res; + } + + } + + ret = v3_file_read(pf->fd, (uint8_t*)&l2_cluster_offset, sizeof(uint64_t), pf->header.l1_table_offset + sizeof(uint64_t) * l1_idx); + + if (ret != sizeof(uint64_t)) { + ERROR("read failed\n"); + return res; + } + + l2_cluster_offset = be64toh(l2_cluster_offset) & ~(QCOW2_COPIED | QCOW2_COMPRESSED); + + /* + * begin to retrieve cluster_offset + */ + + // adjust the l2_cluster_offset to the right entry address + l2_cluster_offset += sizeof(uint64_t) * l2_idx; + + ret = v3_file_read(pf->fd, (uint8_t*)&cluster_offset, sizeof(uint64_t), l2_cluster_offset); + + if (ret!=sizeof(uint64_t)) { + ERROR("read failed\n"); + return res; + } + + cluster_offset = be64toh(cluster_offset) & ~(QCOW2_COPIED | QCOW2_COMPRESSED); + + if (!cluster_offset) { + /* + * if the cluster_offset is not allocated + */ + l2_cluster_idx = v3_qcow2_alloc_clusters(pf, 1); + cluster_offset = (l2_cluster_idx << pf->header.cluster_bits); + + // TODO: initialization + // initialize the cluster with the original data + data_buff = (uint8_t*)V3_Malloc(pf->cluster_size); + + if (data_buff) { + + pos = (pos >> pf->header.cluster_bits) << pf->header.cluster_bits; + + v3_qcow2_read_cluster(pf, data_buff, pos, pf->cluster_size); + + ret = v3_file_write(pf->fd, data_buff, pf->cluster_size, cluster_offset); + + if (ret!=pf->cluster_size) { + ERROR("write failed\n"); + return res; + } + + V3_Free(data_buff); + + } else { + + ERROR("failed to initialize the original data\n"); + + } + + offset = htobe64(cluster_offset | QCOW2_COPIED); + + ret = v3_file_write(pf->fd, (uint8_t*)&offset, sizeof(uint64_t), l2_cluster_offset); + + if (ret!=sizeof(uint64_t)) { + ERROR("write failed\n"); + return res; + } + + v3_qcow2_increase_refcount(pf, l2_cluster_idx); + + } + + done: + return cluster_offset; +} + +static int v3_qcow2_write_cluster(v3_qcow2_t *pf, uint8_t *buff, uint64_t pos, int len) +{ + if(!pf || !buff) { + return -1; + } + + uint64_t cluster_addr, cluster_offset; + int ret = 0; + + cluster_addr = v3_qcow2_alloc_cluster_offset(pf, pos); + + if (!cluster_addr) { + ERROR("zero cluster address\n"); + return -1; + } + + cluster_offset = pos & (pf->cluster_size - 1); + + ret = v3_file_write(pf->fd, buff, len, cluster_addr + cluster_offset); + + if (ret != len) { + ERROR("write failed\n"); + return -1; + } + + return 0; +} + +static int v3_qcow2_write(v3_qcow2_t *pf, uint8_t *buff, uint64_t pos, int len) +{ + if (!pf || !buff || !len) { + return -1; + } + + uint64_t next_addr, cur_len; + int ret = 0; + + while (len) { + next_addr = (pos + pf->cluster_size) & ~(pf->cluster_size - 1); + cur_len = next_addr - pos; + cur_len = cur_len < len ? cur_len : len; + + DEBUG("pos=%llu, len=%llu\n", pos, cur_len); + + ret = v3_qcow2_write_cluster(pf, buff, pos, cur_len); + + if (ret) { + return -1; + } + + buff += cur_len; + pos += cur_len; + len -= cur_len; + } + + return ret; +} + +static int read(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) +{ + v3_qcow2_t * disk = (v3_qcow2_t *) private_data; + + DEBUG("QCOW Reading %llu bytes from %llu to 0x%p\n", num_bytes, lba, buf); + + if (lba + num_bytes > disk->header.size) { + ERROR("Out of bounds read: lba=%llu, num_bytes=%llu, capacity=%llu\n", + lba, num_bytes, disk->header.size); + return -1; + } + + return v3_qcow2_read(disk, buf, lba, num_bytes); +} + + +static int write(uint8_t * buf, uint64_t lba, uint64_t num_bytes, void * private_data) +{ + v3_qcow2_t * disk = (v3_qcow2_t *) private_data; + + DEBUG("QCOW Writing %llu bytes from 0x%p to %llu\n", num_bytes, buf, lba); + + if (lba + num_bytes > disk->header.size) { + ERROR("Out of bounds read: lba=%llu, num_bytes=%llu, capacity=%llu\n", + lba, num_bytes, disk->header.size); + return -1; + } + + return v3_qcow2_write(disk, buf, lba, num_bytes); + +} + + +static uint64_t get_capacity(void * private_data) +{ + v3_qcow2_t * disk = (v3_qcow2_t *)private_data; + + DEBUG("Querying QCOWDISK capacity %llu\n", v3_qcow2_get_capacity(disk)); + + return v3_qcow2_get_capacity(disk); +} + +static struct v3_dev_blk_ops blk_ops = { + .read = read, + .write = write, + .get_capacity = get_capacity, +}; + + + + +static int disk_free(v3_qcow2_t * disk) +{ + v3_qcow2_close(disk); + return 0; +} + +static struct v3_device_ops dev_ops = { + .free = (int (*)(void *))disk_free, +}; + + + + +static int disk_init(struct v3_vm_info * vm, v3_cfg_tree_t * cfg) +{ + v3_qcow2_t * disk = NULL; + char * path = v3_cfg_val(cfg, "path"); + char * dev_id = v3_cfg_val(cfg, "ID"); + char * writable = v3_cfg_val(cfg, "writable"); + char * writeable = v3_cfg_val(cfg, "writeable"); + + v3_cfg_tree_t * frontend_cfg = v3_cfg_subtree(cfg, "frontend"); + int flags = FILE_OPEN_MODE_READ; + + PrintDebug(vm,VCORE_NONE,"Welcome to the QCOWDISK Implementation!\n"); + + if ( ((writable) && (writable[0] == '1')) || + ((writeable) && (writeable[0] == '1')) ) { + flags |= FILE_OPEN_MODE_WRITE; + } + + if (path == NULL) { + PrintError(vm, VCORE_NONE, "Missing path (%s) for %s\n", path, dev_id); + return -1; + } + + disk = v3_qcow2_open(vm, path, flags); + + if (disk == NULL) { + PrintError(vm, VCORE_NONE, "Could not open file disk:%s\n", path); + return -1; + } + + struct vm_device * dev = v3_add_device(vm, dev_id, &dev_ops, disk); + + if (dev == NULL) { + PrintError(vm, VCORE_NONE, "Could not attach device %s\n", dev_id); + V3_Free(disk); + return -1; + } + + if (v3_dev_connect_blk(vm, v3_cfg_val(frontend_cfg, "tag"), + &blk_ops, frontend_cfg, disk) == -1) { + PrintError(vm, VCORE_NONE, "Could not connect %s to frontend %s\n", + dev_id, v3_cfg_val(frontend_cfg, "tag")); + v3_remove_device(dev); + return -1; + } + + + return 0; +} + + +device_register("QCOWDISK", disk_init) diff --git a/v3_config_guest.pl b/v3_config_guest.pl index 35aadc7..5b50f40 100755 --- a/v3_config_guest.pl +++ b/v3_config_guest.pl @@ -278,7 +278,7 @@ print $target "\n"; close(PAL); -print Dumper(\%config); +#print Dumper(\%config); if (defined($config{qemu}) && $config{qemu} eq "y") { gen_qemu_startup(\%config, $pdir, $dir); @@ -847,7 +847,7 @@ sub do_storage { sub do_storage_backend { my ($cr, $pdir, $dir, $name, $frontend, $loc, $frontendblock) = @_; - my ($canramdisk, $canfiledisk, $cannetdisk, $cantmpdisk); + my ($canramdisk, $canfiledisk, $cannetdisk, $cantmpdisk, $canqcowdisk); my @devs=("cd","hd","nothing"); my @disks; my $type; @@ -857,15 +857,17 @@ sub do_storage_backend { $canramdisk = is_palacios_core_feature_enabled($pdir, "V3_CONFIG_RAMDISK"); $canfiledisk = is_palacios_core_feature_enabled($pdir, "V3_CONFIG_FILEDISK"); + $canqcowdisk = is_palacios_core_feature_enabled($pdir, "V3_CONFIG_QCOWDISK"); $cannetdisk = is_palacios_core_feature_enabled($pdir, "V3_CONFIG_NETDISK"); $cantmpdisk = is_palacios_core_feature_enabled($pdir, "V3_CONFIG_TMPDISK"); push @disks, "ramdisk" if $canramdisk; push @disks, "filedisk" if $canramdisk; + push @disks, "qcowdisk" if $canramdisk; push @disks, "netdisk" if $cannetdisk; push @disks, "tmpdisk" if $cantmpdisk; - if (!$canramdisk && !$canfiledisk && !$cannetdisk && !$cantmpdisk) { + if (!$canramdisk && !$canfiledisk && !$cannetdisk && !$cantmpdisk && !$canqcowdisk) { print "You have no storage implementations enabled in your Palacios build, so it is impossible\n"; print "to add anything to storage controller \"$frontend\" location \"$loc\"\n"; return -1; @@ -884,6 +886,7 @@ sub do_storage_backend { print "A storage device requires one of the following implementations\n"; print " * RAMDISK - the data is kept in memory (common) : ".($canramdisk ? "available" : "UNAVAILABLE")."\n"; print " * FILEDISK - the data is kept in a host file (common) : ".($canfiledisk ? "available" : "UNAVAILABLE")."\n"; + print " * QCOWDISK - the data is kept in a host file (qcow) : ".($canqcowdisk ? "available" : "UNAVAILABLE")."\n"; print " * NETDISK - the data is accessed via the network (uncommon) : ".($cannetdisk ? "available" : "UNAVAILABLE")."\n"; print " * TMPDISK - the data is kept in memory and discarded (common) : ".($cantmpdisk ? "available" : "UNAVAILABLE")."\n"; while (1) { @@ -893,7 +896,7 @@ sub do_storage_backend { last if $#test==0; } - if ($type eq "filedisk" || $type eq "ramdisk") { + if ($type eq "filedisk" || $type eq "ramdisk" || $type eq "qcowdisk") { print "$type requires a file (.iso for example). Do you have one? [y] : "; if (get_user("y") eq "y") { while (1) { @@ -930,9 +933,12 @@ sub do_storage_backend { add_device($cr,"RAMDISK","$frontend\_$loc", undef, " $frontend\_$loc\n".$attach); add_file($cr, "$frontend\_$loc", "$frontend\_$loc.dat"); - } else { + } elsif ($type eq "filedisk") { add_device($cr,"FILEDISK","$frontend\_$loc", $what eq "hd" ? "writable=\"1\"" : undef, " $frontend\_$loc.dat\n".$attach); + } else { + add_device($cr,"QCOWDISK","$frontend\_$loc", $what eq "hd" ? "writable=\"1\"" : undef, + " $frontend\_$loc.dat\n".$attach); } last; } else {