First, a quick review of how KVM starts up (http://blog.csdn.net/dashulu/article/details/17074675). qemu drives KVM through a set of interfaces that KVM exposes. qemu's entry point is the main function in vl.c, which initializes KVM by calling kvm_init and machine->init. machine->init creates the vcpus; each vcpu is emulated by a dedicated thread running qemu_kvm_cpu_thread_fn, which eventually calls kvm_cpu_exec. kvm_cpu_exec invokes kvm_vcpu_ioctl to switch into KVM, and when control next returns from KVM, execution resumes right after the kvm_vcpu_ioctl call, where exit_reason is examined and handled accordingly.
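To make the protocol behind kvm_vcpu_ioctl concrete, here is a minimal, self-contained sketch of the raw /dev/kvm run loop. This is not qemu code, just the bare API sequence that kvm_cpu_exec is built around; guest memory/register setup and error checking are omitted:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
    int kvm  = open("/dev/kvm", O_RDWR);        /* handle to the KVM module */
    int vm   = ioctl(kvm, KVM_CREATE_VM, 0);    /* one VM ...               */
    int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);   /* ... with one vcpu        */

    /* The kernel shares per-vcpu exit state (exit_reason, io info, ...)
     * with userspace through a mmap'ed struct kvm_run. */
    long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu, 0);

    /* Guest memory and register setup omitted. */
    for (;;) {
        ioctl(vcpu, KVM_RUN, 0);          /* enter the guest           */
        switch (run->exit_reason) {       /* back in userspace: why?   */
        case KVM_EXIT_IO:
            /* emulate the port access, then loop back into the guest */
            break;
        case KVM_EXIT_HLT:
            return 0;
        default:
            fprintf(stderr, "unhandled exit %d\n", run->exit_reason);
            return 1;
        }
    }
}

qemu's real vcpu loop, kvm_cpu_exec, has exactly this shape, with iothread locking and a full set of exit handlers added: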
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        cpu->exit_request = 0;
        return EXCP_HLT;
    }

    do {
        if (cpu->kvm_vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->kvm_vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (cpu->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        kvm_arch_post_run(cpu, run);

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
            abort();
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    cpu->exit_request = 0;
    return ret;
}

When kvm_vcpu_ioctl executes, the KVM function it lands in is kvm_vcpu_ioctl in virt/kvm/kvm_main.c. With KVM_RUN as the argument, execution eventually reaches vcpu_enter_guest, which calls kvm_x86_ops->run(vcpu). On Intel processors this hook is implemented by vmx_vcpu_run, which sets up the register state and then issues VMLAUNCH or VMRESUME to enter the guest VM; once a VM exit occurs, execution continues from that point.
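The mapping from the generic kvm_x86_ops hooks to the Intel implementations is established in vmx.c; abridged here to the two hooks this article uses (the "..." stands for the many other callbacks in the real table):

static struct kvm_x86_ops vmx_x86_ops = {
    /* ... */
    .run         = vmx_vcpu_run,     /* called by vcpu_enter_guest */
    .handle_exit = vmx_handle_exit,  /* called after a VM exit     */
    /* ... */
};

vmx_vcpu_run itself looks like this: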
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);
    unsigned long debugctlmsr;

    /* ... several lines omitted here ... */

    vmx->__launched = vmx->loaded_vmcs->launched;
    asm(
        /* Store host registers */
        "push %%" _ASM_DX "; push %%" _ASM_BP ";"
        "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
        "push %%" _ASM_CX " \n\t"
        "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
        "je 1f \n\t"
        "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
        __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
        "1: \n\t"
        /* Reload cr2 if changed */
        "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
        "mov %%cr2, %%" _ASM_DX " \n\t"
        "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
        "je 2f \n\t"
        "mov %%" _ASM_AX ", %%cr2 \n\t"
        "2: \n\t"
        /* Check if vmlaunch or vmresume is needed */
        "cmpl $0, %c[launched](%0) \n\t"
        /* Load guest registers.  Don't clobber flags. */
        "mov %c[rax](%0), %%" _ASM_AX " \n\t"
        "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
        "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
        "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
        "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
        "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
        "mov %c[r8](%0),  %%r8  \n\t"
        "mov %c[r9](%0),  %%r9  \n\t"
        "mov %c[r10](%0), %%r10 \n\t"
        "mov %c[r11](%0), %%r11 \n\t"
        "mov %c[r12](%0), %%r12 \n\t"
        "mov %c[r13](%0), %%r13 \n\t"
        "mov %c[r14](%0), %%r14 \n\t"
        "mov %c[r15](%0), %%r15 \n\t"
#endif
        "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

        /* Enter guest mode */
        "jne 1f \n\t"
        __ex(ASM_VMX_VMLAUNCH) "\n\t"
        "jmp 2f \n\t"
        "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
        "2: "
        /* Save guest registers, load host registers, keep flags */
        "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
        "pop %0 \n\t"
        "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
        "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
        __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
        "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
        "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
        "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
        "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
        "mov %%r8,  %c[r8](%0) \n\t"
        "mov %%r9,  %c[r9](%0) \n\t"
        "mov %%r10, %c[r10](%0) \n\t"
        "mov %%r11, %c[r11](%0) \n\t"
        "mov %%r12, %c[r12](%0) \n\t"
        "mov %%r13, %c[r13](%0) \n\t"
        "mov %%r14, %c[r14](%0) \n\t"
        "mov %%r15, %c[r15](%0) \n\t"
#endif
        "mov %%cr2, %%" _ASM_AX " \n\t"
        "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

        "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
        "setbe %c[fail](%0) \n\t"
        ".pushsection .rodata \n\t"
        ".global vmx_return \n\t"
        "vmx_return: " _ASM_PTR " 2b \n\t"
        ".popsection"
        : : "c"(vmx), "d"((unsigned long)HOST_RSP),
        [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
        [fail]"i"(offsetof(struct vcpu_vmx, fail)),
        [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
        [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
        [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
        [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
        [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
        [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
        [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
        [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
        [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
        [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
        [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
        [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
        [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
        [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
        [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
        [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
        [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
        [wordsize]"i"(sizeof(ulong))
        : "cc", "memory"
#ifdef CONFIG_X86_64
        , "rax", "rbx", "rdi", "rsi"
        , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
        , "eax", "ebx", "edi", "esi"
#endif
    );

    /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
    if (debugctlmsr)
        update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
    /*
     * The sysexit path does not restore ds/es, so we must set them to
     * a reasonable value ourselves.
     *
     * We can't defer this to vmx_load_host_state() since that function
     * may be executed in interrupt context, which saves and restore segments
     * around it, nullifying its effect.
     */
    loadsegment(ds, __USER_DS);
    loadsegment(es, __USER_DS);
#endif

    vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
                              | (1 << VCPU_EXREG_RFLAGS)
                              | (1 << VCPU_EXREG_CPL)
                              | (1 << VCPU_EXREG_PDPTR)
                              | (1 << VCPU_EXREG_SEGMENTS)
                              | (1 << VCPU_EXREG_CR3));
    vcpu->arch.regs_dirty = 0;

    vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

    if (is_guest_mode(vcpu)) {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
        if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
            vmcs12->idt_vectoring_error_code =
                vmcs_read32(IDT_VECTORING_ERROR_CODE);
            vmcs12->vm_exit_instruction_len =
                vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
        }
    }

    vmx->loaded_vmcs->launched = 1;

    vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
    trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

    vmx_complete_atomic_exit(vmx);
    vmx_recover_nmi_blocking(vmx);
    vmx_complete_interrupts(vmx);
}
With the initialization flow covered, we can now walk through how IO is handled across kvm and qemu. When the guest VM performs an IO operation that needs to touch a device, a VM exit is triggered and control returns to vmx_vcpu_run. vmx saves the VMCS state, records VM_EXIT_REASON, and returns to its caller vcpu_enter_guest, which near its end calls r = kvm_x86_ops->handle_exit(vcpu). This hook corresponds to vmx_handle_exit (the Intel mapping can be checked against static struct kvm_x86_ops vmx_x86_ops in vmx.c). vmx_handle_exit then calls kvm_vmx_exit_handlers[exit_reason](vcpu), dispatching to a different handler depending on the exit reason. The dispatch table is defined as follows:
static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
    [EXIT_REASON_EXCEPTION_NMI]       = handle_exception,
    [EXIT_REASON_EXTERNAL_INTERRUPT]  = handle_external_interrupt,
    [EXIT_REASON_TRIPLE_FAULT]        = handle_triple_fault,
    [EXIT_REASON_NMI_WINDOW]          = handle_nmi_window,
    [EXIT_REASON_IO_INSTRUCTION]      = handle_io,
    [EXIT_REASON_CR_ACCESS]           = handle_cr,
    [EXIT_REASON_DR_ACCESS]           = handle_dr,
    [EXIT_REASON_CPUID]               = handle_cpuid,
    [EXIT_REASON_MSR_READ]            = handle_rdmsr,
    [EXIT_REASON_MSR_WRITE]           = handle_wrmsr,
    [EXIT_REASON_PENDING_INTERRUPT]   = handle_interrupt_window,
    [EXIT_REASON_HLT]                 = handle_halt,
    [EXIT_REASON_INVD]                = handle_invd,
    [EXIT_REASON_INVLPG]              = handle_invlpg,
    [EXIT_REASON_RDPMC]               = handle_rdpmc,
    [EXIT_REASON_VMCALL]              = handle_vmcall,
    [EXIT_REASON_VMCLEAR]             = handle_vmclear,
    [EXIT_REASON_VMLAUNCH]            = handle_vmlaunch,
    [EXIT_REASON_VMPTRLD]             = handle_vmptrld,
    [EXIT_REASON_VMPTRST]             = handle_vmptrst,
    [EXIT_REASON_VMREAD]              = handle_vmread,
    [EXIT_REASON_VMRESUME]            = handle_vmresume,
    [EXIT_REASON_VMWRITE]             = handle_vmwrite,
    [EXIT_REASON_VMOFF]               = handle_vmoff,
    [EXIT_REASON_VMON]                = handle_vmon,
    [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
    [EXIT_REASON_APIC_ACCESS]         = handle_apic_access,
    [EXIT_REASON_WBINVD]              = handle_wbinvd,
    [EXIT_REASON_XSETBV]              = handle_xsetbv,
    [EXIT_REASON_TASK_SWITCH]         = handle_task_switch,
    [EXIT_REASON_MCE_DURING_VMENTRY]  = handle_machine_check,
    [EXIT_REASON_EPT_VIOLATION]       = handle_ept_violation,
    [EXIT_REASON_EPT_MISCONFIG]       = handle_ept_misconfig,
    [EXIT_REASON_PAUSE_INSTRUCTION]   = handle_pause,
    [EXIT_REASON_MWAIT_INSTRUCTION]   = handle_invalid_op,
    [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
};

vmx_handle_exit performs this dispatch with a bounds check, as the abridged snippet below shows.
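The dispatch itself, near the end of vmx_handle_exit, is just a bounds-checked indexed call. The following is lightly abridged from the same-era vmx.c, so treat the exact wording as approximate; kvm_vmx_max_exit_handlers is simply the size of the table above:

static const int kvm_vmx_max_exit_handlers =
    ARRAY_SIZE(kvm_vmx_exit_handlers);

/* inside vmx_handle_exit: */
if (exit_reason < kvm_vmx_max_exit_handlers
    && kvm_vmx_exit_handlers[exit_reason])
    return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
    /* no handler: report an unknown exit back to userspace */
    vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
    vcpu->run->hw.hardware_exit_reason = exit_reason;
}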
switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); kvm_handle_io(run->io.port, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); ret = 0; break;kvm_handle_io调用cpu_outb, cpu_outw等指令处理IO操作.
Assuming the virtual machine uses a raw-format disk image, the call stack the IO passes through inside qemu looks like this:
#0  bdrv_aio_writev (bs=0x55555629e9b0, sector_num=870456, qiov=0x555556715ab0, nb_sectors=1, cb=0x55555570161b, opaque=0x5555567157b8) at block.c:3408
#1  0x0000555555701960 in ide_sector_write (s=0x5555567157b8) at hw/ide/core.c:798
#2  0x00005555557047ae in ide_data_writew (opaque=0x555556715740, addr=496, val=8995) at hw/ide/core.c:1907
#3  0x00005555558d9e4c in portio_write (opaque=0x5555565c0670, addr=0, data=8995, size=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/ioport.c:174
#4  0x00005555558e13d5 in memory_region_write_accessor (mr=0x5555565c0670, addr=0, value=0x7fffb4dbd528, size=2, shift=0, mask=65535) at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:440
#5  0x00005555558e151d in access_with_adjusted_size (addr=0, value=0x7fffb4dbd528, size=2, access_size_min=1, access_size_max=4, access=0x5555558e1341, mr=0x5555565c0670) at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:477
#6  0x00005555558e3dfb in memory_region_dispatch_write (mr=0x5555565c0670, addr=0, data=8995, size=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:984
#7  0x00005555558e7384 in io_mem_write (mr=0x5555565c0670, addr=0, val=8995, size=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:1748
#8  0x000055555586a18e in address_space_rw (as=0x555556216d80, addr=496, buf=0x7fffb4dbd670 "##", len=2, is_write=true) at /home/dashu/kvm/qemu/qemu-dev-zwu/exec.c:1968
#9  0x000055555586a474 in address_space_write (as=0x555556216d80, addr=496, buf=0x7fffb4dbd670 "##", len=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/exec.c:2030
#10 0x00005555558d98c9 in cpu_outw (addr=496, val=8995) at /home/dashu/kvm/qemu/qemu-dev-zwu/ioport.c:61

bdrv_aio_writev ultimately calls bdrv_co_aio_rw_vector, which runs co = qemu_coroutine_create(bdrv_co_do_rw) to create a coroutine that executes bdrv_co_do_rw (a minimal sketch of this create-and-enter pattern follows).
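The hand-off above follows qemu's standard coroutine idiom of this period: qemu_coroutine_create binds a coroutine to a coroutine_fn entry point, and qemu_coroutine_enter starts it with an opaque argument (in the real code, bdrv_co_do_rw receives the aio control block as its opaque). The entry point and caller below are illustrative names, not qemu functions, and the header path is the one used by qemu trees of this era:

#include "block/coroutine.h"   /* header location varies across qemu versions */

/* Illustrative entry point; bdrv_co_do_rw plays this role in block.c. */
static void coroutine_fn my_co_entry(void *opaque)
{
    /* ... do the IO in straight-line style with bdrv_co_readv() /
     * bdrv_co_writev(), then signal completion to the caller ... */
}

static void start_request(void *acb)   /* illustrative caller */
{
    Coroutine *co = qemu_coroutine_create(my_co_entry);
    /* Runs my_co_entry(acb) until it first yields, e.g. while it
     * waits for the backing file. */
    qemu_coroutine_enter(co, acb);
}

The point of the coroutine is that bdrv_co_do_rw can be written in blocking style while still yielding back to the main loop whenever it would otherwise block.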
The call stack of bdrv_co_do_rw then looks like this:

#1  0x000055555563653c in paio_submit (bs=0x5555562a13d0, fd=10, sector_num=2, qiov=0x555556715ab0, nb_sectors=1, cb=0x5555556028b1, opaque=0x555556964e30, type=1) at block/raw-posix.c:825
#2  0x0000555555636659 in raw_aio_submit (bs=0x5555562a13d0, sector_num=2, qiov=0x555556715ab0, nb_sectors=1, cb=0x5555556028b1, opaque=0x555556964e30, type=1) at block/raw-posix.c:853
#3  0x00005555556366c9 in raw_aio_readv (bs=0x5555562a13d0, sector_num=2, qiov=0x555556715ab0, nb_sectors=1, cb=0x5555556028b1, opaque=0x555556964e30) at block/raw-posix.c:861
#4  0x00005555556029b8 in bdrv_co_io_em (bs=0x5555562a13d0, sector_num=2, nb_sectors=1, iov=0x555556715ab0, is_write=false) at block.c:4038
#5  0x0000555555602a49 in bdrv_co_readv_em (bs=0x5555562a13d0, sector_num=2, nb_sectors=1, iov=0x555556715ab0) at block.c:4055
#6  0x00005555555fed61 in bdrv_co_do_readv (bs=0x5555562a13d0, sector_num=2, nb_sectors=1, qiov=0x555556715ab0, flags=0) at block.c:2547
#7  0x00005555555fee03 in bdrv_co_readv (bs=0x5555562a13d0, sector_num=2, nb_sectors=1, qiov=0x555556715ab0) at block.c:2573
#8  0x0000555555637d8c in raw_co_readv (bs=0x55555629e9b0, sector_num=2, nb_sectors=1, qiov=0x555556715ab0) at block/raw.c:47
#9  0x00005555555fed61 in bdrv_co_do_readv (bs=0x55555629e9b0, sector_num=2, nb_sectors=1, qiov=0x555556715ab0, flags=0) at block.c:2547
#10 0x00005555556023af in bdrv_co_do_rw

Finally, paio_submit hands the request to the thread pool via thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); the scheduler later runs aio_worker, the function that actually performs the IO, reading and writing the disk image with pread and pwrite. A sketch of what such a worker boils down to follows.
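At the bottom of the stack there is no guest involvement at all — just ordinary positional POSIX IO against the image file. A minimal sketch of the write/read core of such a worker (struct io_req and do_raw_io are illustrative names; the real aio_worker in block/raw-posix.c keeps this state in its own request structure and additionally handles flushes, short transfers, and error codes):

#include <sys/types.h>
#include <unistd.h>     /* pread, pwrite */

#define BDRV_SECTOR_SIZE 512

/* Illustrative request descriptor carried to the worker thread. */
struct io_req {
    int    fd;          /* image file descriptor       */
    void  *buf;         /* flattened guest data buffer */
    off_t  sector_num;
    size_t nb_sectors;
    int    is_write;
};

/* What a thread-pool worker ultimately does: plain pread/pwrite
 * at the byte offset corresponding to the guest's sector. */
static ssize_t do_raw_io(struct io_req *req)
{
    off_t  offset = req->sector_num * BDRV_SECTOR_SIZE;
    size_t bytes  = req->nb_sectors * BDRV_SECTOR_SIZE;

    return req->is_write
         ? pwrite(req->fd, req->buf, bytes, offset)
         : pread(req->fd, req->buf, bytes, offset);
}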
Once qemu has completed the IO operation, the loop in kvm_cpu_exec calls kvm_vcpu_ioctl again and re-enters kvm.
That is the complete path an IO operation takes through kvm and qemu.
References:
1. KVM code analysis series (2): IO virtualization: http://blog.csdn.net/fanwenyi/article/details/12748613