OpenStack's Havana (H) release provides live_snapshot, i.e. a snapshot that does not interrupt the workload running inside the VM. The live_snapshot implementation in the code:
nova/virt/libvirt/driver.py
def _live_snapshot():
    try:
        # NOTE (rmk): blockRebase cannot be executed on persistent
        # domains, so we need to temporarily undefine it.
        # If any part of this block fails, the domain is
        # re-defined regardless.
        if domain.isPersistent():
            domain.undefine()

        # NOTE (rmk): Establish a temporary mirror of our root disk and
        # issue an abort once we have a complete copy.
        domain.blockRebase(disk_path, disk_delta, 0,
                           libvirt.VIR_DOMAIN_BLOCK_REBASE_COPY |
                           libvirt.VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT |
                           libvirt.VIR_DOMAIN_BLOCK_REBASE_SHALLOW)

        while self._wait_for_block_job(domain, disk_path):
            time.sleep(0.5)

        domain.blockJobAbort(disk_path, 0)
        libvirt_utils.chown(disk_delta, os.getuid())
    finally:
        self._conn.defineXML(xml)

def _wait_for_block_job(domain, disk_path, abort_on_error=False):
    status = domain.blockJobInfo(disk_path, 0)
    if status == -1 and abort_on_error:
        msg = _('libvirt error while requesting blockjob info.')
        raise exception.NovaException(msg)
    try:
        cur = status.get('cur', 0)
        end = status.get('end', 0)
    except Exception:
        return False

    if cur == end and cur != 0 and end != 0:
        return False
    else:
        return True
Flow analysis:
OpenStack layer: nova first calls the libvirt API domain.blockRebase to start a qemu "mirror job" on the disk, then polls the job by calling domain.blockJobInfo in a loop; once the current position (cur) has caught up with the end offset (end), it calls domain.blockJobAbort to finish the job.
libvirt layer: domain.blockRebase maps to qemu's drive-mirror command and domain.blockJobInfo to querying the block job (info blockjob). domain.blockJobAbort is a synchronous call: it first sends block-job-cancel to qemu and then keeps polling until the job has actually gone away before returning.
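Outside of nova, the same sequence can be driven directly with the libvirt Python bindings. The sketch below only illustrates the call order described above; the domain name, disk target and delta path are made-up examples, and with REUSE_EXT the delta file is expected to exist already (nova pre-creates it).

# Illustration only: the nova flow expressed with raw libvirt Python bindings.
import time
import libvirt

conn = libvirt.open('qemu:///system')
dom = conn.lookupByName('instance-00000001')
disk = 'vda'
delta = '/var/lib/nova/instances/snapshots/disk.delta'

# Start the qemu mirror job (drive-mirror underneath).
dom.blockRebase(disk, delta, 0,
                libvirt.VIR_DOMAIN_BLOCK_REBASE_COPY |
                libvirt.VIR_DOMAIN_BLOCK_REBASE_REUSE_EXT |
                libvirt.VIR_DOMAIN_BLOCK_REBASE_SHALLOW)

# Poll the block job until cur has caught up with end.
while True:
    info = dom.blockJobInfo(disk, 0)
    if info and info.get('cur') == info.get('end') and info.get('end'):
        break
    time.sleep(0.5)

# Synchronous abort (block-job-cancel underneath); this is the call that can hang.
dom.blockJobAbort(disk, 0)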
qemu layer: the mirror job is documented as "Start mirroring a block device's writes to a new destination, using the specified target." Its important main loop:
block/mirror.c
static void coroutine_fn mirror_run(void *opaque)

    for (;;) {
        uint64_t delay_ns;
        int64_t cnt;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that qemu_aio_flush() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                qemu_coroutine_yield();
                continue;
            } else if (cnt != 0) {
                mirror_iteration(s);
                continue;
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = bdrv_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) == BDRV_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase. From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion. This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                s->common.offset = end * BDRV_SECTOR_SIZE;
                if (!s->synced) {
                    block_job_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(bs), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_drain_all();
            cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced);
        if (!s->synced) {
            /* Publish progress */
            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;

            if (s->common.speed) {
                delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
            } else {
                delay_ns = 0;
            }

            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync. Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }
The mirror job loops forever checking for dirty data, and there are only two ways out of the loop:
1. the job is cancelled before source and target have ever been in sync (the !s->synced branch breaks on cancel);
2. after source and target are in sync, should_complete is true and the dirty count is zero within the same iteration; should_complete itself can only become true in an iteration whose dirty count is zero and in which the job has been told to stop.
So while the guest keeps issuing I/O there is only a tiny window in which setting the job state actually terminates the job, and the OpenStack layer tries to hit that window with its sleep(0.5) polling, heh. The toy model below makes this window concrete.
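A small runnable toy model (my own simplification for illustration, NOT qemu code) of the loop's behaviour once block_job_ready() has fired: the cancel is only honoured in an iteration where the dirty count is zero and stays zero through the flush, which a busy guest rarely allows.

# Toy model: how long until the "cancel during a quiet window" exit fires.
import random

def iterations_until_exit(write_prob, max_iters=1000000):
    """write_prob: chance that the guest dirties a block at each step."""
    dirty = 0
    for it in range(1, max_iters + 1):
        if random.random() < write_prob:
            dirty += 1                 # a guest write re-dirties the source
        if dirty:
            dirty -= 1                 # mirror_iteration(): copy a chunk, loop again
            continue
        # dirty count reached zero: flush the target and look at the cancel flag
        should_complete = True         # block_job_is_cancelled() already returns true
        if random.random() < write_prob:
            dirty += 1                 # a write during the flush re-arms the loop
        if should_complete and dirty == 0:
            return it                  # both disks in sync: the job finally exits
    return None                        # no quiet window: the job never terminates

for p in (0.5, 0.9, 0.99, 0.999):
    print(p, iterations_until_exit(p))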
The consequences:
1. The mirror job keeps running until I/O inside the guest OS stops, so host resources stay occupied indefinitely.
2. libvirt's blockJobAbort call never returns; if nova invokes libvirt in blocking mode, nova gets stuck as well (a sketch of one way to keep the caller from wedging follows).
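If the management layer at least wants to protect itself, libvirt's blockJobAbort accepts the VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC flag. The sketch below (domain name, disk target and timeout are arbitrary examples) requests an asynchronous abort and bounds its own wait; note that this only un-blocks the caller, the qemu mirror job itself still cannot finish while the guest keeps writing.

# Sketch: ask libvirt for a non-blocking abort and bound the wait ourselves.
import time
import libvirt

def abort_block_job_bounded(dom, disk, timeout=60.0, poll=0.5):
    # Request cancellation without waiting inside libvirt for the job to die.
    dom.blockJobAbort(disk, libvirt.VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC)
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not dom.blockJobInfo(disk, 0):   # falsy result: no job left on this disk
            return True
        time.sleep(poll)
    return False                            # still running: report it instead of hanging

conn = libvirt.open('qemu:///system')
dom = conn.lookupByName('instance-00000001')
if not abort_block_job_bounded(dom, 'vda'):
    print('mirror job refused to exit; guest I/O probably never paused')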
How to reproduce:
Increase the value passed to sleep(x) and this race becomes very easy to hit; even the default 0.5 still gives it a chance to occur. Keeping a sustained write load running inside the guest, as sketched below, prevents the dirty bitmap from ever draining.
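A minimal I/O generator to run inside the guest while the snapshot is taken; the file path, chunk size and interval are arbitrary choices for the illustration.

# Run inside the guest: keep dirtying blocks so the mirror never catches up.
import os
import time

def keep_dirtying(path='/tmp/dirty.bin', chunk=1 << 20, interval=0.01):
    with open(path, 'wb', buffering=0) as f:
        while True:
            f.seek(0)
            f.write(os.urandom(chunk))   # every write marks the blocks dirty again
            os.fsync(f.fileno())         # force it down to the virtual disk
            time.sleep(interval)

if __name__ == '__main__':
    keep_dirtying()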
Possible fixes:
1. Add a code path in qemu that force-terminates the job.
2. Be careful with the mirror interface and use other means for online backup (one such alternative is sketched below).
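As an example of "other means", one widely used alternative (a sketch under assumed names and paths, not necessarily what this deployment should adopt) is a disk-only external snapshot: the running image becomes a read-only backing file that can be copied out at leisure, after which the overlay can be merged back.

# Sketch of a disk-only external snapshot via libvirt (names/paths are examples).
import libvirt

SNAPSHOT_XML = """
<domainsnapshot>
  <name>backup-snap</name>
  <disks>
    <disk name='vda' snapshot='external'>
      <source file='/var/lib/libvirt/images/instance-00000001.backup-overlay'/>
    </disk>
  </disks>
</domainsnapshot>
"""

conn = libvirt.open('qemu:///system')
dom = conn.lookupByName('instance-00000001')
dom.snapshotCreateXML(SNAPSHOT_XML,
                      libvirt.VIR_DOMAIN_SNAPSHOT_CREATE_DISK_ONLY |
                      libvirt.VIR_DOMAIN_SNAPSHOT_CREATE_NO_METADATA)
# The previous image is now a frozen backing file: copy it out at leisure, then
# merge the overlay back into it (e.g. with dom.blockCommit) once the copy ends.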