记录一次kvm下Windows蓝屏问题
现象
在linux kernel5.10中,kvm虚拟机windows在设置了硬件断点进行debug时报蓝屏。
kvm log日志:
[12787289.097685] kvm [21277]: vcpu1, guest rIP: 0xfffff80003cd1ae3 vmx_set_msr: BTF | LBR in IA32_DEBUGCTLMSR 0x1, nop
解决方式
临时生效:在宿主机上执行 echo 1 > /sys/module/kvm/parameters/ignore_msrs
或者先停止libvirt管理的所有虚拟机,再重新加载kvm模块:rmmod kvm_intel kvm && modprobe kvm ignore_msrs=1 && modprobe kvm_intel
问题原因
在设置硬件断点时,guest虚拟机操作了CR4、DR和msr寄存器。CR4寄存器包含多个标志位,每个位控制一个特定的处理器特性或功能,其中第3位(DE,Debugging Extensions)用于设置是否启用调试扩展,以支持 I/O 断点。DR是调试寄存器(Debug Register)。msr寄存器(Model Specific Register)主要用于存储和控制与处理器相关的特定硬件状态或功能,在kvm中控制虚拟机的状态和硬件访问权限。
在设置调试寄存器和msr寄存器时,会触发vm_exit,cpu从non-root mode切换到root mode,内核kvm模块捕获vm_exit并根据vm_exit的reason进行处理。下面将通过追踪分析,说明为何会导致蓝屏。
arch/x86/kvm/vmx/vmx.c
/*
 * Key data structure.
 * VMX-specific KVM operations for Intel VT-x support. The .handle_exit
 * member is the callback that handles VM exits.
 */
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_unsetup = hardware_unsetup, /* undo hardware setup */
.hardware_enable = hardware_enable, /* enable VMX hardware support */
.hardware_disable = hardware_disable, /* disable VMX hardware support */
.cpu_has_accelerated_tpr = report_flexpriority, /* check for accelerated TPR support */
.has_emulated_msr = vmx_has_emulated_msr, /* check whether an MSR is emulated */
.vm_size = sizeof(struct kvm_vmx), /* size of the VM-specific data */
.vm_init = vmx_vm_init, /* initialize a VM */
.vcpu_create = vmx_create_vcpu, /* create a vCPU */
.vcpu_free = vmx_free_vcpu, /* free a vCPU */
.vcpu_reset = vmx_vcpu_reset, /* reset vCPU state */
.prepare_guest_switch = vmx_prepare_switch_to_guest, /* prepare to enter the guest */
.vcpu_load = vmx_vcpu_load, /* load vCPU state */
.vcpu_put = vmx_vcpu_put, /* save vCPU state */
.update_exception_bitmap = vmx_update_exception_bitmap, /* update the exception bitmap */
.get_msr_feature = vmx_get_msr_feature, /* get supported MSR features */
.get_msr = vmx_get_msr, /* read an MSR value */
.set_msr = vmx_set_msr, /* write an MSR value */
.get_segment_base = vmx_get_segment_base, /* get a segment base address */
.get_segment = vmx_get_segment, /* get segment information */
.set_segment = vmx_set_segment, /* set segment information */
.set_cr0 = vmx_set_cr0, /* set the CR0 register */
.is_valid_cr4 = vmx_is_valid_cr4, /* validate a CR4 value */
.set_cr4 = vmx_set_cr4, /* set the CR4 register */
.run = vmx_vcpu_run, /* run the vCPU */
.handle_exit = vmx_handle_exit, /* handle VM exits */
...
};
/*
 * Key data structure: vmx_init_ops, the init-time callbacks that VMX hands
 * to KVM. Its .runtime_ops member points at vmx_x86_ops.
 */
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest,
.runtime_ops = &vmx_x86_ops,
};
/*
 * Key data structure.
 * Maps each VM-exit reason to its handler function. The
 * EXIT_REASON_MSR_WRITE entry is the one that matters for the blue-screen
 * problem analysed here.
 * This is an array of function pointers; every element has the type
 * int (*)(struct kvm_vcpu *vcpu), i.e. a pointer to a function that takes a
 * struct kvm_vcpu * and returns an int.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, /* external interrupt */
[EXIT_REASON_IO_INSTRUCTION] = handle_io, /* I/O instruction */
[EXIT_REASON_CR_ACCESS] = handle_cr, /* CR register access */
[EXIT_REASON_DR_ACCESS] = handle_dr, /* DR register access */
[EXIT_REASON_CPUID] = kvm_emulate_cpuid, /* emulate CPUID */
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, /* emulate an MSR read */
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, /* emulate an MSR write */
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, /* interrupt window */
[EXIT_REASON_HLT] = kvm_emulate_halt, /* emulate HLT */
[EXIT_REASON_VMCALL] = kvm_emulate_hypercall, /* emulate VMCALL */
};
/*
 * VMX module entry point (excerpt; elided parts are marked with "...").
 * Passes vmx_init_ops and the size/alignment of struct vcpu_vmx to
 * kvm_init() and returns its error code on failure.
 */
static int __init vmx_init(void){
...
/* Initialize KVM */
r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
...
}
其中vmx_init调用virt/kvm/kvm_main.c的kvm_init,kvm_init再通过kvm_arch_hardware_setup调用vmx的hardware_setup,其中对msr寄存器进行操作,**msr寄存器主要用于存储和控制与处理器相关的特定硬件状态或功能,在kvm中控制虚拟机的状态和硬件访问权限。**
/*
 * KVM hardware setup for the architecture.
 * @opaque is treated as a struct kvm_x86_init_ops *. Returns 0 on success
 * or the non-zero result of ops->hardware_setup().
 */
int kvm_arch_hardware_setup(void *opaque)
{
struct kvm_x86_init_ops *ops = opaque;
int r;
rdmsrl_safe(MSR_EFER, &host_efer);
if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
/* Call the vendor (VMX or SVM) hardware_setup through the ops pointer */
r = ops->hardware_setup();
if (r != 0)
return r;
/* Install the vendor's runtime ops as the global kvm_x86_ops */
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
kvm_ops_static_call_update();
if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
perf_register_guest_info_callbacks(&kvm_guest_cbs);
/* Initialize the MSR list */
kvm_init_msr_list();
return 0;
}
/*
 * KVM initialization function (excerpt; elided parts are marked with "...").
 * Each setup step that fails jumps to a label that unwinds the steps
 * completed before it; the function then returns the error code in r.
 */
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
struct module *module)
{
struct kvm_cpu_compat_check c;
int r;
int cpu;
/* Architecture init (x86 here): checks BIOS/CPU features and the like */
r = kvm_arch_init(opaque);
if (r)
goto out_fail;
/*
* kvm_arch_init makes sure there's at most one caller
* for architectures that support multiple implementations,
* like intel and amd on x86.
* kvm_arch_init must be called before kvm_irqfd_init to avoid creating
* conflicts in case kvm is already setup for another implementation.
*/
/* Initialize interrupt virtualization (irqfd) */
r = kvm_irqfd_init();
if (r)
goto out_irqfd;
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
}
/* Hardware setup; reaches VMX's hardware_setup through the ops function pointer */
r = kvm_arch_hardware_setup(opaque);
if (r < 0)
goto out_free_1;
c.ret = &r;
c.opaque = opaque;
/* Check processor compatibility on every online CPU */
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, check_processor_compat, &c, 1);
if (r < 0)
goto out_free_2;
}
r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
kvm_starting_cpu, kvm_dying_cpu);
if (r)
goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
...
kvm_init_debug();
/* Initialize VFIO ops (presumably for direct device I/O support; a failure here only warns) */
r = kvm_vfio_ops_init();
WARN_ON(r);
return 0;
/* Error unwinding: labels run in reverse order of the setup steps above */
out_unreg:
kvm_async_pf_deinit();
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
kvm_arch_hardware_unsetup();
out_free_1:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
kvm_irqfd_exit();
out_irqfd:
kvm_arch_exit();
out_fail:
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
vmx.c 的设置硬件函数
/*
 * Intel VMX hardware setup callback (excerpt; elided parts are marked with
 * "..."). Registered as vmx_init_ops.hardware_setup and invoked from
 * kvm_arch_hardware_setup().
 */
static __init int hardware_setup(void){
...
vmx_setup_user_return_msrs();
/* Set up the VMCS configuration */
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
/* Set or update the currently supported CPU capabilities */
vmx_set_cpu_caps();
/* Allocate a VMCS area for every CPU */
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
/* Install the host-side posted-interrupt wakeup handler */
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
return r;
}
/*
 * Key data structure: VMCS header, one 32-bit word split into a 31-bit
 * revision_id and a 1-bit shadow_vmcs flag.
 */
struct vmcs_hdr {
u32 revision_id:31;
u32 shadow_vmcs:1;
};
/*
 * Key data structure: the VMCS itself. A header, an abort field and a
 * flexible array member holding the remaining data.
 */
struct vmcs {
struct vmcs_hdr hdr;
u32 abort;
char data[];
};
/*
 * Allocate one struct vmcs for every possible CPU and store it in the
 * per-CPU vmxarea variable.
 * Returns 0 on success, or -ENOMEM after calling free_kvm_area() if an
 * allocation fails.
 */
static __init int alloc_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
}
/* With eVMCS enabled, overwrite the header with vmcs_config's revision_id */
if (static_branch_unlikely(&enable_evmcs))
vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
}
return 0;
}
/*
 * Posted-interrupt wakeup callback.
 * Under the current CPU's blocked_vcpu_on_cpu_lock, walks that CPU's
 * blocked_vcpu_on_cpu list and kicks every vCPU for which
 * pi_test_on() returns 1 on its posted-interrupt descriptor.
 */
void pi_wakeup_handler(void)
{
struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
blocked_vcpu_list) {
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (pi_test_on(pi_desc) == 1)
/* Kick the vCPU */
kvm_vcpu_kick(vcpu);
}
raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
/*
 * Kick a vCPU.
 * Returns immediately if kvm_vcpu_wake_up() reports success. Otherwise,
 * when kvm_arch_vcpu_should_kick() says so, sends a reschedule IPI to the
 * CPU recorded in vcpu->cpu, provided it is a valid, online CPU other than
 * the current one.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
int me, cpu;
if (kvm_vcpu_wake_up(vcpu))
return;
/*
* Note, the vCPU could get migrated to a different pCPU at any point
* after kvm_arch_vcpu_should_kick(), which could result in sending an
* IPI to the previous pCPU. But, that's ok because the purpose of the
* IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
* vCPU also requires it to leave IN_GUEST_MODE.
*/
me = get_cpu();
if (kvm_arch_vcpu_should_kick(vcpu)) {
cpu = READ_ONCE(vcpu->cpu);
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
/* Send a reschedule IPI to that CPU */
smp_send_reschedule(cpu);
}
put_cpu();
}
继续查看在vmx_x86_ops中注册为.handle_exit的vmx_handle_exit函数,用于处理vm_exit
/*
 * VM-exit handler registered as vmx_x86_ops.handle_exit.
 * Delegates to __vmx_handle_exit() and returns its result, except when the
 * exit reason has bus_lock_detected set: then the run flags get
 * KVM_RUN_X86_BUS_LOCK and 0 is returned.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
/* Only set the exit reason if the inner handler returned a positive value */
if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
return 0;
}
return ret;
}
/*
 * Inner VM-exit handler (excerpt; elided parts are marked with "...").
 * Only the MSR-write path relevant to this analysis is shown.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
...
/* The VM exit was caused by an MSR write */
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
...
}
/*
 * Emulate a guest MSR write.
 * Takes the MSR index from RCX and the value from EDX:EAX, then tries the
 * write with kvm_set_msr(). Returns 0 when the write is bounced to user
 * space, a negative error as-is, and otherwise the result of the
 * kvm_x86_complete_emulated_msr static call.
 */
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
int r;
/*
 * kvm_set_msr() ends in static_call(kvm_x86_set_msr) inside
 * __kvm_set_msr(), which is vmx_set_msr on VMX.
 */
r = kvm_set_msr(vcpu, ecx, data);
/* MSR write failed? See if we should ask user space */
if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
/* Bounce to user space */
return 0;
/* Signal all other negative errors to userspace */
if (r < 0)
return r;
if (!r)
trace_kvm_msr_write(ecx, data);
else
trace_kvm_msr_write_ex(ecx, data);
return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
/*
 * Write an MSR: forwards to kvm_set_msr_ignored_check() with
 * host_initiated set to false.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
/*
 * Perform the MSR write via __kvm_set_msr() and turn a
 * KVM_MSR_RET_INVALID result into success (0) when
 * kvm_msr_ignored_check() returns true. Any other result is returned
 * unchanged.
 */
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 data, bool host_initiated)
{
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
/* Should the "invalid MSR" result be ignored? */
if (kvm_msr_ignored_check(index, data, true))
ret = 0;
return ret;
}
/*
 * Low-level MSR write (excerpt; elided parts are marked with "...").
 * Ends by dispatching to the vendor set_msr implementation through a
 * static call.
 */
static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
bool host_initiated)
{
...
/*
 * Static calls are a kernel optimization that reduces the cost of calls
 * through dynamic function pointers. A conventional indirect call pays for
 * an extra indirect jump (e.g. the target address is held in a register).
 * A static call patches the call site, at build time or at run time, to
 * call the final implementation directly, which avoids that cost.
 */
/* Ends up calling vmx_set_msr */
return static_call(kvm_x86_set_msr)(vcpu, &msr);
}
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*
* Excerpt: only the MSR_IA32_DEBUGCTLMSR case is shown; elided parts are
* marked with "...".
*/
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info){
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;
switch (msr_index) {
...
case MSR_IA32_DEBUGCTLMSR:
/*
 * In guest mode with VM_EXIT_SAVE_DEBUG_CONTROLS set in vmcs12, record
 * the value in vmcs12's guest_ia32_debugctl field.
 */
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
/* The write itself is handled by the common x86 code */
ret = kvm_set_msr_common(vcpu, msr_info);
break;
}
...
}
最终的处理函数:arch/x86/kvm/x86.c
/*
 * Common x86 MSR write handling (excerpt; elided parts are marked with
 * "..."). For MSR_IA32_DEBUGCTLMSR: a zero value is accepted, any bit other
 * than LBR/BTF makes the function return 1, and a value made only of
 * LBR/BTF bits is not applied ("nop") and is logged when
 * report_ignored_msrs is set.
 */
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
switch (msr) {
...
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
break;
} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
} else if (report_ignored_msrs)
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
}
...
}
msr寄存器主要用于存储和控制与处理器相关的特定硬件状态或功能。对于IA32_DEBUGCTLMSR,除 LBR 和 BTF 外的位是厂商特定的,属于保留位,为了安全起见,写入这些位会触发 #GP 异常。#GP 全称是 General Protection Fault(通用保护异常),它是 x86 架构中由 CPU 硬件触发的一种异常,用于报告违反保护模式规则的操作。此行为可配置,通过/sys/module/kvm/parameters/ignore_msrs可以控制是否忽略掉这个错误。