Symptom:
After deploying a new environment (OpenStack + QEMU + GlusterFS), starting a virtual machine immediately fails with an I/O error.
Analysis:
The qemu and libvirt logs contain no useful errors.
The glusterfs brick log, however, shows the following:
[2014-11-28 09:03:57.156373] E [posix.c:2135:posix_writev] 0-test-posix: write failed: offset 0, Invalid argument
[2014-11-28 09:03:57.156421] I [server-rpc-fops.c:1439:server_writev_cbk] 0-test-server: 21: WRITEV 0 (dd0085c9-9844-44c7-9d39-3b9ec0ca65b1) ==> (Invalid argument)
[2014-11-28 09:04:34.098004] E [posix.c:2135:posix_writev] 0-test-posix: write failed: offset 0, Invalid argument
[2014-11-28 09:04:34.098046] I [server-rpc-fops.c:1439:server_writev_cbk] 0-test-server: 30: WRITEV 0 (dd0085c9-9844-44c7-9d39-3b9ec0ca65b1) ==> (Invalid argument)
We know that, for data safety and to keep live migration usable, OpenStack/QEMU defaults the virtual machine's disk cache policy to "cache=none", which means the disk image is opened with O_DIRECT. The relevant code:
qemu/block.c

int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
qemu/block/raw-posix.c

static void raw_parse_flags(int bdrv_flags, int *open_flags)
{
    assert(open_flags != NULL);

    *open_flags |= O_BINARY;
    *open_flags &= ~O_ACCMODE;
    if (bdrv_flags & BDRV_O_RDWR) {
        *open_flags |= O_RDWR;
    } else {
        *open_flags |= O_RDONLY;
    }

    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
     * and O_DIRECT for no caching. */
    if ((bdrv_flags & BDRV_O_NOCACHE)) {
        *open_flags |= O_DIRECT;
    }
}
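As a quick check on a compute node, the cache mode an instance actually got can be read back from its libvirt definition; the domain name below is only an example:

# inspect the disk driver line of a running instance
virsh dumpxml instance-00000001 | grep "cache="
# with the OpenStack default you would expect something like:
#   <driver name='qemu' type='qcow2' cache='none'/>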
Let's reproduce this with a small test program:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
    int fd, ret;
    int flags = 0;
    void *buf = NULL;

    /* O_DIRECT requires both the buffer address and the I/O size to be
     * aligned to the logical sector size of the underlying filesystem, so
     * use a 512-byte aligned buffer instead of a plain stack array. */
    if (posix_memalign(&buf, 512, 512) != 0) {
        perror("posix_memalign");
        exit(-1);
    }
    memset(buf, 'x', 512);

    flags = O_CREAT | O_RDWR | O_DIRECT;
    printf("open flags %d\n", flags);

    fd = open("/datas/local/testfile", flags, 0644);
    if (fd < 0) {
        perror("open");
        exit(-1);
    }

    /* a 512-byte direct write: this is what fails when the filesystem was
     * formatted with 4096-byte sectors */
    ret = write(fd, buf, 512);
    printf("write ret = %d\n", ret);
    if (ret < 0)
        perror("write");

    ret = close(fd);
    printf("close ret = %d\n", ret);

    free(buf);
    exit(0);
}
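One way to build and run the reproducer against the brick's local filesystem; the dd line is an equivalent one-shot check (the compiler options, file names and the /datas/local path are assumptions carried over from the snippet above):

gcc -Wall -o odirect_test odirect_test.c
./odirect_test
# the same check without compiling anything: a 512-byte O_DIRECT write
dd if=/dev/zero of=/datas/local/testfile bs=512 count=1 oflag=direct
# on the affected deployment both fail with "Invalid argument" (EINVAL),
# matching the posix_writev errors in the brick log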
The reproducer fails in exactly the same way. Turning to glusterfs to see how it handles O_DIRECT, there is an option called "network.remote-dio":
From the glusterfs commit message that added remote-dio to the virt profile:

group-virt: Change profile to include remote-dio and exclude posix-aio.

remote-dio enables filtering O_DIRECT in the client xlator. This has been
found to be useful for improving performance when there are multiple VMs
talking to an image store.

Aggregated throughput results for a single thread iozone run from multiple VMs
and a single host can be seen below:

-------------------------------------------------
No. of VMs | remote-dio on | remote-dio off |
-------------------------------------------------
     2     |   400 MB/s    |    202 MB/s   |
     4     |   650 MB/s    |    410 MB/s   |
-------------------------------------------------

posix-aio has not been found to improve performance consistently with VM image
workload. Hence not including that in the default virt profile.

Change-Id: I592f68b95a955036f1a985352d2f4950ced1deef
BUG: 907301
Signed-off-by: Vijay Bellur <vbellur@redhat.com>
Reviewed-on: http://review.gluster.org/4460
Reviewed-by: Anand Avati <avati@redhat.com>
Tested-by: Anand Avati <avati@redhat.com>
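That option is part of glusterfs's virt profile, so instead of toggling individual options the whole group can be applied at once, provided the profile file ships with the installed glusterfs (the volume name is the one used below):

# applies remote-dio=enable together with the other virt-store tunables,
# read from /var/lib/glusterd/groups/virt on the servers
gluster volume set test-vol group virt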
In this case only the single option was needed: network.remote-dio defaults to disable, and after running "gluster volume set test-vol network.remote-dio on" both the standalone test and the virtual machines work normally again. Now let's see what this option actually does.
glusterd-volume-set.c

{ .key         = "network.remote-dio",
  .voltype     = "protocol/client",
  .option      = "filter-O_DIRECT",
  .op_version  = 2,
  .flags       = OPT_FLAG_CLIENT_OPT
},
client.c

{ .key  = {"filter-O_DIRECT"},
  .type = GF_OPTION_TYPE_BOOL,
  .default_value = "disable",
  .description = "If enabled, in open() and creat() calls, O_DIRECT "
  "flag will be filtered at the client protocol level so server will "
  "still continue to cache the file. This works similar to NFS's "
  "behavior of O_DIRECT",
},
{ .key  = {NULL} },

int32_t
client_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
             int32_t flags, fd_t *fd, dict_t *xdata)
{
        int                   ret  = -1;
        clnt_conf_t          *conf = NULL;
        rpc_clnt_procedure_t *proc = NULL;
        clnt_args_t           args = {0,};

        conf = this->private;
        if (!conf || !conf->fops)
                goto out;

        args.loc   = loc;
        args.fd    = fd;
        args.xdata = xdata;

        if (!conf->filter_o_direct)
                args.flags = flags;
        else
                args.flags = (flags & ~O_DIRECT);
io-cache.c
-
/* If O_DIRECT open, we disable caching on it */
-
if ((local->flags & O_DIRECT)){
-
/* O_DIRECT is only for one fd, not the inode
-
* as a whole
-
*/
-
fd_ctx_set (fd, this, 1);
-
}
-
posix.c
-
int32_t
-
posix_open (call_frame_t *frame, xlator_t *this,
-
loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
-
{
-
int32_t op_ret = -1;
-
int32_t op_errno = 0;
-
char *real_path = NULL;
-
int32_t _fd = -1;
-
struct posix_fd *pfd = NULL;
-
struct posix_private *priv = NULL;
-
struct iatt stbuf = {0, };
-
-
DECLARE_OLD_FS_ID_VAR;
-
-
VALIDATE_OR_GOTO (frame, out);
-
VALIDATE_OR_GOTO (this, out);
-
VALIDATE_OR_GOTO (this->private, out);
-
VALIDATE_OR_GOTO (loc, out);
-
VALIDATE_OR_GOTO (fd, out);
-
-
priv = this->private;
-
VALIDATE_OR_GOTO (priv, out);
-
-
MAKE_INODE_HANDLE (real_path, this, loc, &stbuf);
-
-
op_ret = -1;
-
SET_FS_ID (frame->root->uid, frame->root->gid);
-
-
if (priv->o_direct)
-
flags |= O_DIRECT;
-
-
_fd = open (real_path, flags, 0);
In short, when network.remote-dio is enabled, the glusterfs client handles O_DIRECT itself (it only influences the client-side io-cache and read-ahead translators) and no longer passes the flag down to the filesystem beneath the brick.
There is a thread about this on the qemu-devel list: https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg00446.html

Q: What is the effect of O_DIRECT on the client exactly?
A: To avoid caching in the io-cache module, disable read-ahead etc (if those translators are loaded). The behavior in write-behind is tunable. You could either disable write-behind entirely (which will happen once libgfapi supports 0-copy/RDMA) or perform a sliding-window like size-limited write-behind (defaults to 1MB).
The suspicion at this point is that the filesystem backing the glusterfs brick is rejecting these O_DIRECT writes, and the reproducer confirms it. It turned out that the bricks in this deployment had been switched to XFS, and looking into XFS's handling of O_DIRECT turned up a known issue: when XFS is formatted with 4096-byte sectors, direct I/O must be aligned to the sector size, so the 512-byte writes above fail with EINVAL.
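One way to confirm the sector-size theory on the storage node (the mount point and device are the ones that appear elsewhere in this post):

# sector size the XFS brick was formatted with -- look at the sectsz= field
xfs_info /datas/local
# logical and physical sector sizes reported by the backing device
blockdev --getss --getpbsz /dev/mapper/image-glance
# with sectsz=4096, O_DIRECT I/O that is only 512-byte aligned is rejected with EINVAL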
So the problem is clear: when a glusterfs brick sits on XFS, pay attention to how the layers above use O_DIRECT, and to the sector size the brick was formatted with.
Solutions
Any one of the following works:
1. Pin the disk cache mode in nova.conf (writethrough uses O_DSYNC rather than O_DIRECT, per the comment in raw_parse_flags above):

[libvirt]
disk_cachemodes="file=writethrough"
2. Enable network.remote-dio on the glusterfs volume, so the client strips O_DIRECT before it reaches the brick:

gluster volume set volume-name network.remote-dio on
3. Format the XFS bricks with a 512-byte sector size so that 512-byte direct I/O is accepted, e.g.:

mkfs.xfs -f -s 512 /dev/mapper/image-glance
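Whichever option is chosen, the result can be verified afterwards (volume name, mount point and device are the illustrative ones used above):

# option 2: the reconfigured option shows up under "Options Reconfigured"
gluster volume info volume-name | grep remote-dio
# option 3: a re-formatted brick should now report sectsz=512
xfs_info /datas/local | grep sectsz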