随着IT行业的发展,Linux容器一直是比较火的话题,这种轻量级的虚拟机逐渐替代vmware或者hypervisor成为新一代武林盟主。既然是虚拟机,那么它势必满足虚拟机的特点:虚拟机(container)和虚拟机之间相互独立,可以理解为两个不同的设备,一个大概的框图如下:
从上图可见,Linux下的容器是运行在OS之上的虚拟机(OS共享),因此这和传统的vmware和hypervisor并不相同。最初运行这个容器(虚拟机)在Linux下是采用的LXC来运行的,但是后来dotcloud改成了runc(基于go语言),当然github上面也有c语言版的运行时crun:https://github.com/containers/crun。他们二者主要都是基于Linux内核提供的机制cgroup和namespace来实现的。下图为docker的一个模型,
蓝色是Linux下的可执行指令,绿色为守护进程,紫色为虚拟机, 下图为google的k8s的模型框图(google引出了CRI接口),
因此,不管从docker或者是k8s(k8s兼容了docker),我们都可以看到一个runc的结点,这个结点就是本章分析的容器运行时。那么什么叫容器运行时(runtime)?这是一种和操作系统强相关的运行程序(最初并没有和containerd分开),也就是主要在对应的操作系统上动态建立一个虚拟机,然后用该虚拟机来运行容器镜像里面的app。因此,我们可以知道,容器镜像里面一般只有app和app依赖的库文件,并不包含操作系统镜像,这也就是容器更轻量级的原因。
一个容器运行时,所需要的环境bundle包必须在bundle里面包含一个满足runtime规范的配置文件config.json和一个目录结构,通常是rootfs, 这个config.json可以通过runc spec生成,如下图,
注:运行时的配置文件config.json必须满足运行时规范:https://github.com/opencontainers/runtime-spec; 容器镜像必须满足镜像规范:https://github.com/opencontainers/image-spec, 其中runc/crun只需满足运行时规范。
crun的main函数在crun.c中,实现如下:
-
int
-
main (int argc, char **argv)
-
{
-
libcrun_error_t err = NULL;
-
int ret, first_argument;
-
-
argp_program_version_hook = print_version;
-
-
// C库的参数解析函数,first_argument得到第一个自己解析的索引
-
argp_parse (&argp, argc, argv, ARGP_IN_ORDER, &first_argument, &arguments);
-
-
// 通过名字获取到操作函数,这里是create, ps, start, run , exec等函数操作
-
command = get_command (argv[first_argument]);
-
if (command == NULL)
-
libcrun_fail_with_error (0, "unknown command %s", argv[first_argument]);
-
-
// 执行对应的操作函数
-
ret = command->handler (&arguments, argc - first_argument, argv + first_argument, &err);
-
if (ret && err)
-
libcrun_fail_with_error (err->status, "%s", err->msg);
-
return ret;
-
}
这里我们只讲create函数,实现如下:
点击(此处)折叠或打开
-
int crun_command_create (struct crun_global_arguments *global_args, int argc, char **argv, libcrun_error_t *err)
-
{
-
int first_arg, ret;
-
libcrun_container_t *container;
-
cleanup_free char *bundle_cleanup = NULL;
-
-
crun_context.preserve_fds = 0;
-
// 同样的,调用C的解析函数,解析CREATE的子参数
-
argp_parse (&run_argp, argc, argv, ARGP_IN_ORDER, &first_arg, &crun_context);
-
-
crun_assert_n_args (argc - first_arg, 1, 1);
-
-
/* 确定bundle的路径. */
-
if (bundle)
-
{
-
if (bundle[0] != '/')
-
{
-
bundle_cleanup = realpath (bundle, NULL);
-
if (bundle_cleanup == NULL)
-
libcrun_fail_with_error (errno, "realpath `%s` failed", bundle);
-
bundle = bundle_cleanup;
-
}
-
-
if (chdir (bundle) < 0)
-
libcrun_fail_with_error (errno, "chdir `%s` failed", bundle);
-
}
-
-
// 初始化上下文环境
-
ret = init_libcrun_context (&crun_context, argv[first_arg], global_args, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
// 解析 config.json数据,并保存到container->def结构当中
-
container = libcrun_container_load_from_file (config_file, err);
-
if (container == NULL)
-
libcrun_fail_with_error (0, "error loading config.json");
-
-
crun_context.bundle = bundle ? bundle : ".";
-
if (getenv ("LISTEN_FDS"))
-
crun_context.preserve_fds += strtoll (getenv ("LISTEN_FDS"), NULL, 10);
-
-
// 通过配置文件和上下文,创建一个容器
-
return libcrun_container_create (&crun_context, container, 0, err);
-
}
具体的创建函数,libcrun_container_create函数实现如下:
点击(此处)折叠或打开
-
int
-
libcrun_container_create (libcrun_context_t *context, libcrun_container_t *container, unsigned int options, libcrun_error_t *err)
-
{
-
runtime_spec_schema_config_schema *def = container->container_def;
-
int ret;
-
int container_ready_pipe[2];
-
cleanup_close int pipefd0 = -1;
-
cleanup_close int pipefd1 = -1;
-
cleanup_close int exec_fifo_fd = -1;
-
context->detach = 1; // 为1表示,crun create 指令不阻塞,直接返回(只有crun run这个detach是0)
-
container->context = context;
-
// 检查oci版本
-
if (def->oci_version && strstr (def->oci_version, "1.0") == NULL)
-
return crun_make_error (err, 0, "unknown version specified");
-
// 检查配置文件
-
ret = check_config_file (def, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (def->process && def->process->terminal && context->console_socket == NULL)
-
return crun_make_error (err, 0, "use --console-socket with create when a terminal is used");
-
// 检查运行时目录,通常为/var/run/crun/*
-
ret = libcrun_status_check_directories (context->state_root, context->id, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 创建执行等待fifo,主要是给create/start组合使用,run指令没有这个技能,即start会触发fifo让容器运行
-
exec_fifo_fd = libcrun_status_create_exec_fifo (context->state_root, context->id, err);
-
if (UNLIKELY (exec_fifo_fd < 0))
-
return exec_fifo_fd;
-
-
context->fifo_exec_wait_fd = exec_fifo_fd;
-
exec_fifo_fd = -1;
-
-
if ((options & LIBCRUN_RUN_OPTIONS_PREFORK) == 0)
-
{
-
// 将config.json从bundle拷贝到工作目录
-
ret = libcrun_copy_config_file (context->id, context->state_root, context->bundle, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 创建容器环境,并运行容器
-
ret = libcrun_container_run_internal (container, context, -1, err);
-
if (UNLIKELY (ret < 0))
-
force_delete_container_status (context, def);
-
return ret;
-
}
-
....此处省略,这里不分析的代码....
-
}
libcrun_container_run_internal实现如下:
点击(此处)折叠或打开
-
static int
-
libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_t *context, int container_ready_fd, libcrun_error_t *err)
-
{
-
runtime_spec_schema_config_schema *def = container->container_def;
-
int ret;
-
pid_t pid;
-
int detach = context->detach;
-
cleanup_free char *cgroup_path = NULL;
-
cleanup_free char *scope = NULL;
-
cleanup_close int terminal_fd = -1;
-
cleanup_terminal void *orig_terminal = NULL;
-
cleanup_close int sync_socket = -1;
-
cleanup_close int notify_socket = -1;
-
cleanup_close int socket_pair_0 = -1;
-
cleanup_close int socket_pair_1 = -1;
-
cleanup_close int seccomp_fd = -1;
-
cleanup_close int console_socket_fd = -1;
-
cleanup_close int hooks_out_fd = -1;
-
cleanup_close int hooks_err_fd = -1;
-
int cgroup_mode, cgroup_manager;
-
char created[35];
-
uid_t root_uid = -1;
-
gid_t root_gid = -1;
-
struct container_entrypoint_s container_args =
-
{
-
.container = container,
-
.context = context,
-
.terminal_socketpair = {-1, -1},
-
.console_socket_fd = -1,
-
.hooks_out_fd = -1,
-
.hooks_err_fd = -1,
-
};
-
-
if (def->hooks && (def->hooks->prestart_len
-
|| def->hooks->poststart_len
-
|| def->hooks->create_runtime_len
-
|| def->hooks->create_container_len
-
|| def->hooks->start_container_len))
-
{
-
// 打开hook的输入输出fd
-
ret = open_hooks_output (container, &hooks_out_fd, &hooks_err_fd, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
container_args.hooks_out_fd = hooks_out_fd;
-
container_args.hooks_err_fd = hooks_err_fd;
-
}
-
-
container->context = context;
-
-
if (!detach || context->notify_socket)
-
{
-
ret = prctl (PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "set child subreaper");
-
}
-
-
if (!context->no_new_keyring)
-
{ // 创建keyring
-
ret = libcrun_create_keyring (container->context->id, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
-
{
-
container_args.has_terminal_socket_pair = 1;
-
ret = create_socket_pair (container_args.terminal_socketpair, err);
-
if (UNLIKELY (ret < 0))
-
return crun_error_wrap (err, "create terminal socket");
-
-
socket_pair_0 = container_args.terminal_socketpair[0];
-
socket_pair_1 = container_args.terminal_socketpair[1];
-
}
-
// 设置信号为block
-
ret = block_signals (err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (def->linux && def->linux->seccomp)
-
{ // 创建seccomp.bpf, 用于系统调用安全检查
-
ret = open_seccomp_output (context->id, &seccomp_fd, false, context->state_root, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
container_args.seccomp_fd = seccomp_fd;
-
-
if (context->console_socket)
-
{// 如果指定了本地socket, 则用本地socket作为容器的标准输入输出
-
console_socket_fd = open_unix_domain_client_socket (context->console_socket, 0, err);
-
if (UNLIKELY (console_socket_fd < 0))
-
return crun_error_wrap (err, "open console socket");
-
container_args.console_socket_fd = console_socket_fd;
-
}
-
// 获取cgroup版本模式等
-
cgroup_mode = libcrun_get_cgroup_mode (err);
-
if (cgroup_mode < 0)
-
return cgroup_mode;
-
// 运行一个linux容器,容器运行后,调用container_init函数初始化
-
pid = libcrun_run_linux_container (container, container_init, &container_args,
-
&sync_socket, err);
-
if (UNLIKELY (pid < 0))
-
return pid;
-
-
if (context->fifo_exec_wait_fd < 0 && context->notify_socket)
-
{
-
/* Do not open the notify socket here on "create". "start" will take care of it. */
-
ret = get_notify_fd (context, container, ?ify_socket, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
if (container_args.terminal_socketpair[1] >= 0)
-
close_and_reset (&socket_pair_1);
-
-
cgroup_manager = CGROUP_MANAGER_CGROUPFS;
-
if (context->systemd_cgroup)
-
cgroup_manager = CGROUP_MANAGER_SYSTEMD;
-
else if (context->force_no_cgroup)
-
cgroup_manager = CGROUP_MANAGER_DISABLED;
-
-
get_root_in_the_userns_for_cgroups (def, container->host_uid, container->host_gid, &root_uid, &root_gid);
-
-
{
-
struct libcrun_cgroup_args cg =
-
{
-
.resources = def->linux ? def->linux->resources : NULL,
-
.annotations = def->annotations,
-
.cgroup_mode = cgroup_mode,
-
.path = &cgroup_path,
-
.scope = &scope,
-
.cgroup_path = def->linux ? def->linux->cgroups_path : "",
-
.manager = cgroup_manager,
-
.pid = pid,
-
.root_uid = root_uid,
-
.root_gid = root_gid,
-
.id = context->id,
-
.systemd_subgroup = find_systemd_subgroup (container, cgroup_mode),
-
};
-
// 打开cgroup
-
ret = libcrun_cgroup_enter (&cg, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
// 配置cgroup
-
if (def->linux && def->linux->resources)
-
{
-
ret = libcrun_update_cgroup_resources (cgroup_mode,
-
def->linux->resources,
-
cgroup_path, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
}
-
}
-
-
/* sync 1. */ 通知容器可以进行1初始化
-
ret = sync_socket_send_sync (sync_socket, true, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
/* sync 2. */等待容器通知,可以进行2初始化
-
ret = sync_socket_wait_sync (context, sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
// 执行hook函数
-
if (def->hooks && def->hooks->prestart_len)
-
{
-
ret = do_hooks (def, pid, context->id, false, NULL, "created",
-
(hook **) def->hooks->prestart,
-
def->hooks->prestart_len, hooks_out_fd, hooks_err_fd, err);
-
if (UNLIKELY (ret != 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
}
-
if (def->hooks && def->hooks->create_runtime_len)
-
{
-
ret = do_hooks (def, pid, context->id, false, NULL, "created",
-
(hook **) def->hooks->create_runtime,
-
def->hooks->create_runtime_len, hooks_out_fd, hooks_err_fd, err);
-
if (UNLIKELY (ret != 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
}
-
// 如果支持系统调用安全检查,这里就产生一个bpf文件
-
if (seccomp_fd >= 0)
-
{
-
unsigned int seccomp_gen_options = 0;
-
const char *annotation;
-
-
annotation = find_annotation (container, "run.oci.seccomp_fail_unknown_syscall");
-
if (annotation && strcmp (annotation, "0") != 0)
-
seccomp_gen_options = LIBCRUN_SECCOMP_FAIL_UNKNOWN_SYSCALL;
-
-
ret = libcrun_generate_seccomp (container, seccomp_fd, seccomp_gen_options, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
close_and_reset (&seccomp_fd);
-
}
-
-
/* sync 3. */ 通知容器进程,可以进行3初始化
-
ret = sync_socket_send_sync (sync_socket, true, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
-
{ // 如果没有指定本地socket作为容器的终端, 这里创建一个
-
terminal_fd = receive_fd_from_socket (socket_pair_0, err);
-
if (UNLIKELY (terminal_fd < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
close_and_reset (&socket_pair_0);
-
// 设置socket的teminal属性
-
ret = libcrun_setup_terminal_master (terminal_fd, &orig_terminal, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
}
-
-
/* sync 4. */ 等待容器通知,可以进行4初始化
-
ret = sync_socket_wait_sync (context, sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
ret = close_and_reset (&sync_socket);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
get_current_timestamp (created);
-
// 写容器状态,pid是容器的进程id
-
ret = write_container_status (container, context, pid, cgroup_path, scope, created, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
-
// 执行running hook函数
-
if (context->fifo_exec_wait_fd < 0 && def->hooks && def->hooks->poststart_len)
-
{
-
ret = do_hooks (def, pid, context->id, true, NULL, "running",
-
(hook **) def->hooks->poststart,
-
def->hooks->poststart_len, hooks_out_fd, hooks_err_fd, err);
-
if (UNLIKELY (ret < 0))
-
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
-
}
-
// 如果detach是true则return;这个wait主要给crun run使用, 对于create就是一个空函数
-
ret = wait_for_process (pid, context, terminal_fd, notify_socket, container_ready_fd, err);
-
if (!context->detach)
-
{
-
cleanup_watch (context, 0, sync_socket, terminal_fd, err);
-
crun_error_release (err);
-
}
-
-
return ret;
-
}
创建一个容器,并运行libcrun_run_linux_container实现如下:
点击(此处)折叠或打开
-
pid_t
-
libcrun_run_linux_container (libcrun_container_t *container,
-
container_entrypoint_t entrypoint,
-
void *args,
-
int *sync_socket_out,
-
libcrun_error_t *err)
-
{
-
__attribute__((cleanup (cleanup_free_init_statusp))) struct init_status_s init_status;
-
runtime_spec_schema_config_schema *def = container->container_def;
-
cleanup_close int sync_socket_container = -1;
-
char *notify_socket_env = NULL;
-
cleanup_close int sync_socket_host = -1;
-
bool clone_can_create_userns;
-
int sync_socket[2];
-
pid_t pid;
-
size_t i;
-
int ret;
-
// 初始化命名空间,打开命名空间fd
-
ret = configure_init_status (&init_status, container, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
get_private_data (container)->unshare_flags = init_status.all_namespaces;
-
#ifdef CLONE_NEWCGROUP
-
/* cgroup will be unshared later. Once the process is in the correct cgroup. */
-
init_status.all_namespaces &= ~CLONE_NEWCGROUP;
-
#endif
-
// 创建 容器和主进程通信的socket
-
ret = socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sync_socket);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "socketpair");
-
-
sync_socket_host = sync_socket[0];
-
sync_socket_container = sync_socket[1];
-
-
get_uid_gid_from_def (container->container_def,
-
&container->container_uid,
-
&container->container_gid);
-
-
/* 设置rlimis值 */
-
if (def->process)
-
{
-
ret = libcrun_set_rlimits (def->process->rlimits, def->process->rlimits_len, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
// 设置oom属性
-
ret = libcrun_set_oom (container, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 配置命名空间相关,是否则在新的容器进程支持对应的命名空间
-
if ((init_status.all_namespaces & CLONE_NEWIPC) && (init_status.all_namespaces & CLONE_NEWUSER))
-
{
-
for (i = 0; i < init_status.fd_len; i++)
-
if (init_status.value[i] == CLONE_NEWIPC)
-
init_status.join_ipcns = true;
-
}
-
-
if (init_status.all_namespaces & CLONE_NEWPID)
-
{
-
init_status.must_fork = true;
-
for (i = 0; i < init_status.fd_len; i++)
-
{
-
if (init_status.value[i] == CLONE_NEWPID)
-
{
-
init_status.join_pidns = true;
-
if (setns (init_status.fd[i], CLONE_NEWPID) == 0)
-
{
-
init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
-
init_status.must_fork = false;
-
close_and_reset (&init_status.fd[i]);
-
}
-
break;
-
}
-
}
-
/* It creates a new PID namespace, without a user namespace, we can try to
-
join it immediately without another fork. */
-
if (i == init_status.fd_len && (init_status.all_namespaces & CLONE_NEWUSER) == 0)
-
{
-
if (unshare (CLONE_NEWPID) == 0)
-
{
-
init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
-
init_status.must_fork = false;
-
}
-
}
-
}
-
#ifdef CLONE_NEWTIME
-
if (init_status.all_namespaces & CLONE_NEWTIME)
-
init_status.must_fork = true;
-
#endif
-
-
clone_can_create_userns = init_status.fd_len == 0;
-
-
if ((init_status.all_namespaces & CLONE_NEWUSER) && init_status.userns_index < 0)
-
init_status.delayed_userns_create = !clone_can_create_userns || init_status.fd_len > 0;
-
-
// 创建一个容器进程
-
pid = syscall_clone ((init_status.namespaces_to_unshare & (clone_can_create_userns ? CLONE_NEWUSER : 0)) | SIGCHLD, NULL);
-
if (UNLIKELY (pid < 0))
-
return crun_make_error (err, errno, "clone");
-
-
if (clone_can_create_userns)
-
init_status.namespaces_to_unshare &= ~CLONE_NEWUSER;
-
-
if (pid)
-
{ // 主进程 保存容器额外的信息
-
ret = save_external_descriptors (container, pid, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 关闭主进程不需要的socket,即容器用的socket
-
ret = close_and_reset (&sync_socket_container);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "close");
-
// 如果是clone_newuser,需要等待容器ready
-
if (init_status.all_namespaces & CLONE_NEWUSER)
-
{
-
if (init_status.delayed_userns_create)
-
{
-
ret = expect_success_from_sync_socket (sync_socket_host, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
if (init_status.userns_index < 0)
-
{
-
ret = libcrun_set_usernamespace (container, pid, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
ret = TEMP_FAILURE_RETRY (write (sync_socket_host, "1", 1));
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "write to sync socket");
-
}
-
}
-
// 如果容器还要进行fork,则需要等待容器fork出来的进程id
-
if (init_status.must_fork)
-
{
-
pid_t grandchild = 0;
-
-
ret = expect_success_from_sync_socket (sync_socket_host, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
ret = TEMP_FAILURE_RETRY (read (sync_socket_host, &grandchild, sizeof (grandchild)));
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "read pid from sync socket");
-
-
/* Cleanup the first process. */
-
waitpid (pid, NULL, 0);
-
-
pid = grandchild;
-
}
-
-
ret = expect_success_from_sync_socket (sync_socket_host, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
*sync_socket_out = get_and_reset (&sync_socket_host);
-
// 返回最终容器的进程id
-
return pid;
-
}
-
-
/* Inside the container process. */
-
// 容器进程关闭主进程的不用的通信套接字
-
ret = close_and_reset (&sync_socket_host);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "close");
-
-
// 初始化一个命名空间相关的(添加命名空间)
-
ret = init_container (container, sync_socket_container, &init_status, err);
-
if (UNLIKELY (ret < 0))
-
{
-
char failure = 1;
-
-
ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &failure, 1));
-
if (UNLIKELY (ret < 0))
-
goto localfail;
-
-
send_error_to_sync_socket_and_die (sync_socket_container, false, err);
-
-
localfail:
-
libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
-
_exit (EXIT_FAILURE);
-
}
-
else
-
{
-
char success = 0;
-
-
ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &success, 1));
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
/* Jump into the specified entrypoint. */
-
if (container->context->notify_socket)
-
xasprintf (¬ify_socket_env, "NOTIFY_SOCKET=%s/notify", container->context->notify_socket);
-
// 执行容器的真正初始化,即container_init
-
entrypoint (args, notify_socket_env, sync_socket_container, err);
-
-
/* ENTRYPOINT returns only on an error, fallback here: */
-
if (*err)
-
libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
-
_exit (EXIT_FAILURE);
-
}
容器进程初始化,container_init实现如下:
点击(此处)折叠或打开
-
static int container_init (void *args, char *notify_socket, int sync_socket,
-
libcrun_error_t *err)
-
{
-
struct container_entrypoint_s *entrypoint_args = args;
-
int ret;
-
runtime_spec_schema_config_schema *def = entrypoint_args->container->container_def;
-
cleanup_free const char *exec_path = NULL;
-
cleanup_free char *notify_socket_cleanup = notify_socket;
-
-
entrypoint_args->sync_socket = sync_socket;
-
//改变日志输出fd到syncsocket
-
crun_set_output_handler (log_write_to_sync_socket, args, false);
-
// 初始化容器,一些组件的设置
-
ret = container_init_setup (args, notify_socket, sync_socket, &exec_path, err);
-
if (UNLIKELY (ret < 0))
-
{
-
/* If it fails to write the error using the sync socket, then fallback
-
to stderr. */
-
if (sync_socket_write_error (sync_socket, err) < 0)
-
return ret;
-
-
crun_error_release (err);
-
return ret;
-
}
-
-
entrypoint_args->sync_socket = -1;
-
//解除信号阻塞
-
ret = unblock_signals (err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
/* sync 4. */ 通知主进程,可以进行第4步初始化
-
ret = sync_socket_send_sync (sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
close_and_reset (&sync_socket);
-
// 如果exec_wait_fd>=0表示 这是create调用, 则会在这里阻塞,知道命令行调用crun start来解除
-
if (entrypoint_args->context->fifo_exec_wait_fd >= 0)
-
{
-
char buffer[1];
-
fd_set read_set;
-
cleanup_close int fd = entrypoint_args->context->fifo_exec_wait_fd;
-
entrypoint_args->context->fifo_exec_wait_fd = -1;
-
-
FD_ZERO (&read_set);
-
FD_SET (fd, &read_set);
-
do
-
{
-
ret = select (fd + 1, &read_set, NULL, NULL, NULL);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "select");
-
-
ret = TEMP_FAILURE_RETRY (read (fd, buffer, sizeof (buffer)));
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "read from the exec fifo");
-
}
-
while (ret == 0);
-
-
close_and_reset (&entrypoint_args->context->fifo_exec_wait_fd);
-
}
-
// 设置打印输出到标准错误
-
crun_set_output_handler (log_write_to_stderr, NULL, false);
-
-
if (def->process && def->process->no_new_privileges)
-
{
-
char **seccomp_flags = NULL;
-
size_t seccomp_flags_len = 0;
-
-
if (def->linux && def->linux->seccomp)
-
{
-
seccomp_flags = def->linux->seccomp->flags;
-
seccomp_flags_len = def->linux->seccomp->flags_len;
-
}
-
// 设置 系统调用检查配置
-
ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
close_and_reset (&entrypoint_args->seccomp_fd);
-
}
-
-
if (UNLIKELY (def->process == NULL))
-
return crun_make_error (err, 0, "block 'process' not found");
-
-
if (UNLIKELY (exec_path == NULL))
-
return crun_make_error (err, 0, "executable path not specified");
-
// 运行starting hook函数
-
if (def->hooks && def->hooks->start_container_len)
-
{
-
libcrun_container_t *container = entrypoint_args->container;
-
-
ret = do_hooks (def, 0, container->context->id, false, NULL, "starting",
-
(hook **) def->hooks->start_container,
-
def->hooks->start_container_len,
-
entrypoint_args->hooks_out_fd,
-
entrypoint_args->hooks_err_fd,
-
err);
-
if (UNLIKELY (ret != 0))
-
return ret;
-
-
/* Seek stdout/stderr to the end. If the hooks were using the same files,
-
the container process overwrites what was previously written. */
-
(void) lseek (1, 0, SEEK_END);
-
(void) lseek (2, 0, SEEK_END);
-
}
-
// 执行容器里面的app,这里开始,用户程序开始运行
-
execv (exec_path, def->process->args);
-
-
if (errno == ENOENT)
-
return crun_make_error (err, errno, "exec container process (missing dynamic library?) `%s`", exec_path);
-
-
return crun_make_error (err, errno, "exec container process `%s`", exec_path);
-
}
container_init_setup函数实现如下:
点击(此处)折叠或打开
-
static int
-
container_init_setup (void *args, char *notify_socket,
-
int sync_socket, const char **exec_path,
-
libcrun_error_t *err)
-
{
-
struct container_entrypoint_s *entrypoint_args = args;
-
libcrun_container_t *container = entrypoint_args->container;
-
int ret;
-
int has_terminal;
-
cleanup_close int console_socket = -1;
-
cleanup_close int console_socketpair = -1;
-
runtime_spec_schema_config_schema *def = container->container_def;
-
runtime_spec_schema_config_schema_process_capabilities *capabilities;
-
cleanup_free char *rootfs = NULL;
-
int no_new_privs;
-
// 初始化selinux和apparmor 安全相关的
-
ret = initialize_security (def->process, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// UP配置回环网络
-
ret = libcrun_configure_network (container, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 得到roofs的实际路径
-
rootfs = realpath (def->root->path, NULL);
-
if (UNLIKELY (rootfs == NULL))
-
{
-
/* If realpath failed for any reason, try the relative directory. */
-
rootfs = xstrdup (def->root->path);
-
}
-
// 得到终端输入输出fd
-
if (entrypoint_args->terminal_socketpair[0] >= 0)
-
{
-
close_and_reset (&entrypoint_args->terminal_socketpair[0]);
-
console_socketpair = entrypoint_args->terminal_socketpair[1];
-
}
-
-
/* sync 1. */ 等待主进程通知可以开始第1步初始化
-
ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
has_terminal = container->container_def->process && container->container_def->process->terminal;
-
if (has_terminal && entrypoint_args->context->console_socket)
-
console_socket = entrypoint_args->console_socket_fd;
-
// 配置sysctl参数
-
ret = libcrun_set_sysctl (container, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 挂载需要的文件系统(包括新的rootfs)
-
ret = libcrun_set_mounts (container, rootfs, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
/* sync 2. */ 通知主进程,可以开始2阶段初始化
-
ret = sync_socket_send_sync (sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
/* sync 3. */ 等待主进程通知,可以进行第3阶段初始化
-
ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 执行created hook
-
if (def->hooks && def->hooks->create_container_len)
-
{
-
ret = do_hooks (def, 0, container->context->id, false, NULL, "created",
-
(hook **) def->hooks->create_container,
-
def->hooks->create_container_len,
-
entrypoint_args->hooks_out_fd,
-
entrypoint_args->hooks_err_fd,
-
err);
-
if (UNLIKELY (ret != 0))
-
return ret;
-
}
-
// 设置selinux相关的label
-
if (def->process)
-
{
-
ret = libcrun_set_selinux_exec_label (def->process, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
ret = libcrun_set_apparmor_profile (def->process, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
// 关闭多余的fd套件字
-
ret = close_fds_ge_than (entrypoint_args->context->preserve_fds + 3, err);
-
if (UNLIKELY (ret < 0))
-
crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err);
-
// 切换到新的rootfs去工作
-
ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
// 重新使用/dev/null
-
ret = libcrun_reopen_dev_null (err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (clearenv ())
-
return crun_make_error (err, errno, "clearenv");
-
// 上面清除了环境变量, 这里重新设置新的环境变量
-
if (def->process)
-
{
-
size_t i;
-
-
for (i = 0; i < def->process->env_len; i++)
-
if (putenv (def->process->env[i]) < 0)
-
return crun_make_error (err, errno, "putenv `%s`", def->process->env[i]);
-
}
-
-
if (getenv ("HOME") == NULL)
-
{
-
ret = set_home_env (container->container_uid);
-
if (UNLIKELY (ret < 0 && errno != ENOTSUP))
-
{
-
setenv("HOME", "/", 1);
-
libcrun_warning ("cannot detect HOME environment variable, setting default");
-
}
-
}
-
-
if (def->process && def->process->cwd)
-
if (UNLIKELY (chdir (def->process->cwd) < 0))
-
return crun_make_error (err, errno, "chdir");
-
// 查找到用户程序路径
-
if (def->process && def->process->args)
-
{
-
*exec_path = find_executable (def->process->args[0], def->process->cwd);
-
if (UNLIKELY (*exec_path == NULL))
-
{
-
if (errno == ENOENT)
-
return crun_make_error (err, errno, "executable file not found in $PATH");
-
-
return crun_make_error (err, errno, "open executable");
-
}
-
}
-
-
ret = setsid ();
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "setsid");
-
// 如果有终端,将终端fd通知主进程
-
if (has_terminal)
-
{
-
cleanup_close int terminal_fd = -1;
-
-
fflush (stderr);
-
-
terminal_fd = libcrun_set_terminal (container, err);
-
if (UNLIKELY (terminal_fd < 0))
-
return terminal_fd;
-
-
if (console_socket >= 0)
-
{
-
ret = send_fd_to_socket (console_socket, terminal_fd, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
close_and_reset (&console_socket);
-
}
-
else if (entrypoint_args->has_terminal_socket_pair && console_socketpair >= 0)
-
{
-
ret = send_fd_to_socket (console_socketpair, terminal_fd, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
close_and_reset (&console_socketpair);
-
}
-
}
-
// 设置容器主机名字
-
ret = libcrun_set_hostname (container, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (container->container_def->linux && container->container_def->linux->personality)
-
{
-
ret = libcrun_set_personality (container->container_def->linux->personality, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
if (def->process->user)
-
umask (def->process->user->umask_present ? def->process->user->umask : 0022);
-
// 设置准备好的读取seccomp.bpf规则到系统调用安全检查
-
if (def->process && !def->process->no_new_privileges)
-
{
-
char **seccomp_flags = NULL;
-
size_t seccomp_flags_len = 0;
-
-
if (def->linux && def->linux->seccomp)
-
{
-
seccomp_flags = def->linux->seccomp->flags;
-
seccomp_flags_len = def->linux->seccomp->flags_len;
-
}
-
-
ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
close_and_reset (&entrypoint_args->seccomp_fd);
-
}
-
-
capabilities = def->process ? def->process->capabilities : NULL;
-
no_new_privs = def->process ? def->process->no_new_privileges : 1;
-
ret = libcrun_set_caps (capabilities, container->container_uid, container->container_gid, no_new_privs, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (notify_socket)
-
{
-
if (putenv (notify_socket) < 0)
-
return crun_make_error (err, errno, "putenv `%s`", notify_socket);
-
}
-
-
return 0;
-
}
wait_for_process等待函数实现如下:
点击(此处)折叠或打开
-
static int wait_for_process (pid_t pid, libcrun_context_t *context, int terminal_fd, int notify_socket, int container_ready_fd, libcrun_error_t *err)
-
{
-
cleanup_close int epollfd = -1;
-
cleanup_close int signalfd = -1;
-
int ret, container_exit_code = 0, last_process;
-
sigset_t mask;
-
int fds[10];
-
int levelfds[10];
-
int levelfds_len = 0;
-
int fds_len = 0;
-
-
container_exit_code = 0;
-
-
if (context->pid_file)
-
{
-
char buf[12];
-
size_t buf_len = sprintf (buf, "%d", pid);
-
ret = write_file (context->pid_file, buf, buf_len, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
}
-
-
/* crun create 将在这里返回,而crun run将会继续运行. */
-
if (context->detach && notify_socket < 0)
-
return 0;
-
-
if (container_ready_fd >= 0)
-
{
-
ret = 0;
-
TEMP_FAILURE_RETRY (write (container_ready_fd, &ret, sizeof (ret)));
-
close_and_reset (&container_ready_fd);
-
}
-
-
sigfillset (&mask);
-
ret = sigprocmask (SIG_BLOCK, &mask, NULL);
-
if (UNLIKELY (ret < 0))
-
return crun_make_error (err, errno, "sigprocmask");
-
// 将所有发送给当前主进程的信号转换为fd。
-
signalfd = create_signalfd (&mask, err);
-
if (UNLIKELY (signalfd < 0))
-
return signalfd;
-
// 等待一个子进程退出
-
ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
-
if (last_process)
-
return container_exit_code;
-
// 将终端和systemd的fd加入到select
-
fds[fds_len++] = signalfd;
-
if (notify_socket >= 0)
-
fds[fds_len++] = notify_socket;
-
if (terminal_fd >= 0)
-
{
-
fds[fds_len++] = 0;
-
levelfds[levelfds_len++] = terminal_fd;
-
}
-
fds[fds_len++] = -1;
-
levelfds[levelfds_len++] = -1;
-
// 创建epoll
-
epollfd = epoll_helper (fds, levelfds, err);
-
if (UNLIKELY (epollfd < 0))
-
return epollfd;
-
-
while (1)
-
{
-
struct signalfd_siginfo si;
-
ssize_t res;
-
struct epoll_event events[10];
-
int i, nr_events;
-
-
nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, 10, -1));
-
if (UNLIKELY (nr_events < 0))
-
return crun_make_error (err, errno, "epoll_wait");
-
-
for (i = 0; i < nr_events; i++)
-
{ // 这里做了一个转发,这样crun run可以实时显示容器的输入输出了
-
if (events[i].data.fd == 0)
-
{
-
ret = copy_from_fd_to_fd (0, terminal_fd, 0, err);
-
if (UNLIKELY (ret < 0))
-
return crun_error_wrap (err, "copy to terminal fd");
-
}
-
else if (events[i].data.fd == terminal_fd)
-
{
-
ret = set_blocking_fd (terminal_fd, 0, err);
-
if (UNLIKELY (ret < 0))
-
return crun_error_wrap (err, "set terminal fd not blocking");
-
-
ret = copy_from_fd_to_fd (terminal_fd, 1, 1, err);
-
if (UNLIKELY (ret < 0))
-
return crun_error_wrap (err, "copy from terminal fd");
-
-
ret = set_blocking_fd (terminal_fd, 1, err);
-
if (UNLIKELY (ret < 0))
-
return crun_error_wrap (err, "set terminal fd blocking");
-
}// 如果是给systemd的
-
else if (events[i].data.fd == notify_socket)
-
{
-
ret = handle_notify_socket (notify_socket, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
if (ret && context->detach)
-
return 0;
-
} // 接收的的信号
-
else if (events[i].data.fd == signalfd)
-
{
-
res = TEMP_FAILURE_RETRY (read (signalfd, &si, sizeof (si)));
-
if (UNLIKELY (res < 0))
-
return crun_make_error (err, errno, "read from signalfd");
-
if (si.ssi_signo == SIGCHLD)
-
{// 表示容器退出,crun run 返回
-
ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
-
if (UNLIKELY (ret < 0))
-
return ret;
-
if (last_process)
-
return container_exit_code;
-
}
-
else
-
{
-
/* Send any other signal to the child process. */
-
ret = kill (pid, si.ssi_signo);
-
}
-
}
-
else
-
{
-
return crun_make_error (err, 0, "unknown fd from epoll_wait");
-
}
-
}
-
}
-
-
return 0;
-
}
这里贴出C代码编写的容器运行时crun的程序调用栈。
另外crun, 大概涉及到systemd, seccomp, sysctl, oom, keyring, rlimit, apparmor,selinux, cgroup, namespace(UTS, IPC, PID, NET,MOUNT,USER)等与Linux强相关的组件。
创建容器: crun create container_id (当前目录下必须有config.json和rootfs)
运行容器: crun start container_id
删除容器: crun delete container_id
创建+运行容器: crun run container_id
查看进程id:crun ps container_id
阅读(937) | 评论(0) | 转发(0) |