qemu-kvm 设备虚拟化----I/O 端口和 I/O 内存
操作设备存在两种接口:I/O 端口和 I/O 内存,下面分析虚拟机如何截获和模拟这两种情况的。
1.用户空间访问内核数据结构信息
内存映射可被实现来提供用户程序对设备内存的直接存取,KVM 内核代表每个 VCPU 的 struct kvm_run 数据结构被 mmap用户空间,从而用户空间可以读取 struct kvm_run 中的信息,对于mmio读写操作来说,可以知道
其地址和大小。当然可以读取其他的信息例如struct kvm_coalesced_mmio_ring等。
mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
env->kvm_run =mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,0);
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct kvm_vcpu *vcpu = vma->vm_file->private_data;
struct page *page;
if (vmf->pgoff == 0)
page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
else
return VM_FAULT_SIGBUS;
get_page(page);
vmf->page = page;
return 0;
}
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
.fault = kvm_vcpu_fault,
};
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
vma->vm_ops = &kvm_vcpu_vm_ops;
return 0;
}
2.I/O 内存-mmio的截获和模拟流程
首先用户态对mmio地址的读写函数注册
然后MMIO 会被 KVM 内核截获
最后从KVM 内核返回到用户空间,由 qemu-kvm 来完成 MMIO 读写的模拟。
qemu-kvm 用户态
=================
设备都注册自己特定mmio地址的读写函数(read/write) 函数
iomemtype = cpu_register_io_memory(hpet_ram_read,hpet_ram_write, s);
cpu_register_physical_memory(HPET_BASE, 0x400, iomemtype);
int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
CPUWriteMemoryFunc * const *mem_write,
void *opaque)
{
return cpu_register_io_memory_fixed(0, mem_read, mem_write, opaque);
}
static int cpu_register_io_memory_fixed(int io_index,
CPUReadMemoryFunc * const *mem_read,
CPUWriteMemoryFunc * const *mem_write,
void *opaque)
{
int i, subwidth = 0;
if (io_index <= 0) {
io_index = get_free_io_mem_idx();
if (io_index == -1)
return io_index;
} else {
io_index >>= IO_MEM_SHIFT;
if (io_index >= IO_MEM_NB_ENTRIES)
return -1;
}
for(i = 0;i < 3; i++) {
if (!mem_read[i] || !mem_write[i])
subwidth = IO_MEM_SUBWIDTH;
io_mem_read[io_index][i] = mem_read[i];
io_mem_write[io_index][i] = mem_write[i];
}
io_mem_opaque[io_index] = opaque;
return (io_index << IO_MEM_SHIFT) | subwidth;
}
int kvm_run(CPUState *env)
{
case KVM_EXIT_MMIO:
r = handle_mmio(env);
}
static int handle_mmio(CPUState *env)
{
unsigned long addr = env->kvm_run->mmio.phys_addr;
struct kvm_run *kvm_run = env->kvm_run;
void *data = kvm_run->mmio.data;
/* hack: Red Hat 7.1 generates these weird accesses. */
if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
return 0;
cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
return 0;
}
void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
int len, int is_write)
{
int l, io_index;
uint8_t *ptr;
uint32_t val;
target_phys_addr_t page;
unsigned long pd;
PhysPageDesc *p;
while (len > 0) {
page = addr & TARGET_PAGE_MASK;
l = (page + TARGET_PAGE_SIZE) - addr;
if (l > len)
l = len;
p = phys_page_find(page >> TARGET_PAGE_BITS);
if (!p) {
pd = IO_MEM_UNASSIGNED;
} else {
pd = p->phys_offset;
}
if (is_write) {
if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
target_phys_addr_t addr1 = addr;
io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
if (p)
addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
/* XXX: could force cpu_single_env to NULL to avoid
potential bugs */
if (l >= 4 && ((addr1 & 3) == 0)) {
/* 32 bit write access */
val = ldl_p(buf);
io_mem_write[io_index][2](io_mem_opaque[io_index], addr1, val);
l = 4;
} else if (l >= 2 && ((addr1 & 1) == 0)) {
/* 16 bit write access */
val = lduw_p(buf);
io_mem_write[io_index][1](io_mem_opaque[io_index], addr1, val);
l = 2;
} else {
/* 8 bit write access */
val = ldub_p(buf);
io_mem_write[io_index][0](io_mem_opaque[io_index], addr1, val);
l = 1;
}
} else {
unsigned long addr1;
addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
/* RAM case */
ptr = qemu_get_ram_ptr(addr1);
memcpy(ptr, buf, l);
if (!cpu_physical_memory_is_dirty(addr1)) {
/* invalidate code */
tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
/* set dirty bit */
if (!cpu_physical_memory_get_dirty(addr1, MIGRATION_DIRTY_FLAG))
ram_list.dirty_pages++;
ram_list.phys_dirty[addr1 >> TARGET_PAGE_BITS] |=
(0xff & ~CODE_DIRTY_FLAG);
}
/* qemu doesn't execute guest code directly, but kvm does
therefore flush instruction caches */
if (kvm_enabled())
flush_icache_range((unsigned long)ptr,
((unsigned long)ptr)+l);
}
} else {
if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
!(pd & IO_MEM_ROMD)) {
target_phys_addr_t addr1 = addr;
/* I/O case */
io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
if (p)
addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
if (l >= 4 && ((addr1 & 3) == 0)) {
/* 32 bit read access */
val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr1);
stl_p(buf, val);
l = 4;
} else if (l >= 2 && ((addr1 & 1) == 0)) {
/* 16 bit read access */
val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr1);
stw_p(buf, val);
l = 2;
} else {
/* 8 bit read access */
val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr1);
stb_p(buf, val);
l = 1;
}
} else {
/* RAM case */
ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) +
(addr & ~TARGET_PAGE_MASK);
memcpy(buf, ptr, l);
}
}
len -= l;
buf += l;
addr += l;
}
}
qemu-kvm 内核态
==============
static int handle_exception(struct kvm_vcpu *vcpu)
{
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
trace_kvm_page_fault(cr2, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
int r;
enum emulation_result er;
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
if (!r) {
r = 1;
goto out;
}
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
er = emulate_instruction(vcpu, cr2, error_code, 0);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
}
out:
return r;
}
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u32 error_code)
{
/* mmio */
if (is_error_pfn(pfn)) {
pgprintk("gfn %lx is mmio\n", walker.gfn);
kvm_release_pfn_clean(pfn);
return is_fault_pfn(pfn) ? -EFAULT : 1;
}
}
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
{
if ((r || vcpu->mmio_is_write) && run) {
run->exit_reason = KVM_EXIT_MMIO;
run->mmio.phys_addr = vcpu->mmio_phys_addr;
memcpy(run->mmio.data, vcpu->mmio_data, 8);
run->mmio.len = vcpu->mmio_size;
run->mmio.is_write = vcpu->mmio_is_write;
}
if (r) {
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
if (!vcpu->mmio_needed) {
kvm_report_emulation_failure(vcpu, "mmio");
return EMULATE_FAIL;
}
return EMULATE_DO_MMIO;
}
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
vcpu->mmio_needed = 0;
return EMULATE_DO_MMIO;
}
return EMULATE_DONE;
}
内核态返回到用户态
======================
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
r = __vcpu_run(vcpu);
out:
post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
vcpu_put(vcpu);
return r;
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
else
kvm_run->ready_for_interrupt_injection =
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
}
2 coalesced mmio截获和模拟流程。
这种方式模拟 MMIO 会被 KVM 内核截取,但 KVM 并不会立即跳出到 qemu-kvm 用户空间,KVM
将需要模拟的读写操作形成一个记录 (struct kvm_coalesced_mmio), 放在在代表整个 VM 的 struct kvm
所指向的一个环形缓冲区中 (struct kvm_coalesced_mmio_ring), 这个环形缓冲区被 mmap 到了用户空间。
当下一次代表某个 VCPU 的 qemu-kvm 线程返回到用户空间后,就会对环形缓冲区中的记录进行处理,执行 MMIO
读写模拟。 对于这种方式, qemu-kvm 一次模拟的可能是已经被积累起来的多个 MMIO 读写操作,
显然这种方式是一种性能优化,它适合于对响应时间要求不是很严格的 MMIO 写操作。
1.用户态对mmio地址的读写函数注册
2.然后MMIO 会被 KVM 内核截获
3.不返回到用户空间,积累起来的多个 MMIO 读写操作。
qemu-kvm 用户态
=================
设备都注册自己特定coalesced mmio地址的读写函数(read/write) 函数
d->mmio_index = cpu_register_io_memory(e1000_mmio_read, e1000_mmio_write, d);
cpu_register_physical_memory(addr, PNPMMIO_SIZE, d->mmio_index);
qemu_register_coalesced_mmio(addr, excluded_regs[0]);
qemu-kvm 内核态
==============
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
int r;
enum emulation_result er;
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
if (!r) {
r = 1;
goto out;
}
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
er = emulate_instruction(vcpu, cr2, error_code, 0);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
}
out:
return r;
}
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
if (emulator_write_phys(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
/*
* Is this MMIO handled locally?
*/
if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->mmio_is_write = 1;
memcpy(vcpu->mmio_data, val, bytes);
return X86EMUL_CONTINUE;
}
static int coalesced_mmio_write(struct kvm_io_device *this,
gpa_t addr, int len, const void *val)
{
struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
if (!coalesced_mmio_in_range(dev, addr, len))
return -EOPNOTSUPP;
spin_lock(&dev->lock);
/* copy data in first free entry of the ring */
ring->coalesced_mmio[ring->last].phys_addr = addr;
ring->coalesced_mmio[ring->last].len = len;
memcpy(ring->coalesced_mmio[ring->last].data, val, len);
smp_wmb();
ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
spin_unlock(&dev->lock);
return 0;
}
内核态返回到用户态
==================
不返回用户态,积累起来的多个 MMIO 读写操作。当其他vm exit退出时,返回用户态,再进行coalesced_mmio处理。
////////////////////////////
3.I/O 端口模拟和截获---pio
io指令大概分为两种单字和字串指令。下面只分析单字io端口。
例如
Intel语法的in、out指令格式为:
IN 累加器, {端口号│DX}
OUT {端口号│DX},累加器
1.设备端口的注册
以内核模拟i8259设备为例说明,如果采用用户空间模拟设备,需要返回到用户空间处理handle_io,过程稍复杂。
kvm_iodevice_init(&s->dev, &picdev_ops);
mutex_lock(&kvm->slots_lock);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
mutex_unlock(&kvm->slots_lock);
if (ret < 0) {
kfree(s);
return NULL;
}
2.截获io端口操作
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int size, in, string;
unsigned port;
++vcpu->stat.io_exits;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
if (string) {
if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
size = (exit_qualification & 7) + 1;
in = (exit_qualification & 8) != 0;
port = exit_qualification >> 16;
skip_emulated_instruction(vcpu);
return kvm_emulate_pio(vcpu, in, size, port);
}
int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
unsigned long val;
trace_kvm_pio(!in, port, size, 1);
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
vcpu->run->io.port = vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.string = 0;
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
return 1;
}
return 0;
}
3.调用i8259设备注册读写函数,进行模拟
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
/* TODO: String I/O for in kernel device */
int r;
if (vcpu->arch.pio.in)
r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
vcpu->arch.pio.port, vcpu->arch.pio.size,
pd);
return r;
}
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, void *val)
{
int i;
struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
return 0;
return -EOPNOTSUPP;
}
static inline int kvm_iodevice_read(struct kvm_io_device *dev,
gpa_t addr, int l, void *v)
{
return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
}
//8259读操作函数
static int picdev_read(struct kvm_io_device *this,
gpa_t addr, int len, void *val)
{
struct kvm_pic *s = to_pic(this);
unsigned char data = 0;
if (!picdev_in_range(addr))
return -EOPNOTSUPP;
if (len != 1) {
if (printk_ratelimit())
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
case 0xa0:
case 0xa1:
data = pic_ioport_read(&s->pics[addr >> 7], addr);
break;
case 0x4d0:
case 0x4d1:
data = elcr_ioport_read(&s->pics[addr & 1], addr);
break;
}
*(unsigned char *)val = data;
pic_unlock(s);
return 0;
}
4.pio完成后,保存相应寄存器(rax)到vmcs中。
int complete_pio(struct kvm_vcpu *vcpu)
{
struct kvm_pio_request *io = &vcpu->arch.pio;
long delta;
int r;
unsigned long val;
if (!io->string) {
if (io->in) {
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(&val, vcpu->arch.pio_data, io->size);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
}
}
}
http://blog.csdn.net/zhuriyuxiao/article/details/9233955
阅读(941) | 评论(0) | 转发(0) |