蓝色为Linux下的可执行指令,绿色为守护进程,紫色为虚拟机。下图为Google的k8s模型框图(Google引入了CRI接口):
容器运行时所需的环境称为bundle。bundle中必须包含一个满足runtime规范的配置文件config.json和一个目录结构(通常是rootfs)。这个config.json可以通过runc spec生成,如下图:
注:运行时的配置文件config.json必须满足运行时规范:https://github.com/opencontainers/runtime-spec;容器镜像必须满足镜像规范:https://github.com/opencontainers/image-spec。其中runc/crun只需满足运行时规范。
// Program entry point: parses the global options, looks up the sub-command
// named by argv[first_argument] and dispatches to its handler.
// NOTE(review): this listing is an excerpt — braces and some declarations
// (e.g. `command`, `arguments`) are not shown here.
main (int argc, char **argv)
libcrun_error_t err = NULL;
int ret, first_argument;
argp_program_version_hook = print_version;
// argp_parse is the C library's argument parser; first_argument receives the
// index of the first argument that is left for us to parse ourselves
argp_parse (&argp, argc, argv, ARGP_IN_ORDER, &first_argument, &arguments);
// Look up the command by name; the commands here are create, ps, start, run, exec, etc.
command = get_command (argv[first_argument]);
if (command == NULL)
libcrun_fail_with_error (0, "unknown command %s", argv[first_argument]);
// Invoke the handler of the selected command
ret = command->handler (&arguments, argc - first_argument, argv + first_argument, &err);
if (ret && err)
libcrun_fail_with_error (err->status, "%s", err->msg);
return ret;
// Handler for `crun create`: parses the sub-command options, resolves and
// enters the bundle directory, loads the container definition from the config
// file and hands it to libcrun_container_create.
// NOTE(review): braces are not shown in this excerpt, and `bundle`,
// `config_file`, `crun_context` are declared outside the visible lines.
int crun_command_create (struct crun_global_arguments *global_args, int argc, char **argv, libcrun_error_t *err)
int first_arg, ret;
libcrun_container_t *container;
cleanup_free char *bundle_cleanup = NULL;
crun_context.preserve_fds = 0;
// Likewise, call the C library parser to parse the sub-arguments of CREATE
argp_parse (&run_argp, argc, argv, ARGP_IN_ORDER, &first_arg, &crun_context);
crun_assert_n_args (argc - first_arg, 1, 1);
/* Determine the bundle path. */
if (bundle)
if (bundle[0] != '/')
bundle_cleanup = realpath (bundle, NULL);
if (bundle_cleanup == NULL)
libcrun_fail_with_error (errno, "realpath `%s` failed", bundle);
bundle = bundle_cleanup;
if (chdir (bundle) < 0)
libcrun_fail_with_error (errno, "chdir `%s` failed", bundle);
// Initialize the context
ret = init_libcrun_context (&crun_context, argv[first_arg], global_args, err);
if (UNLIKELY (ret < 0))
return ret;
// Parse the config.json data and store it in the container->def structure
container = libcrun_container_load_from_file (config_file, err);
if (container == NULL)
libcrun_fail_with_error (0, "error loading config.json");
crun_context.bundle = bundle ? bundle : ".";
// LISTEN_FDS, when set, adds to the number of fds to preserve
if (getenv ("LISTEN_FDS"))
crun_context.preserve_fds += strtoll (getenv ("LISTEN_FDS"), NULL, 10);
// Create a container from the config file and the context
return libcrun_container_create (&crun_context, container, 0, err);
// Creates a container: validates the definition, prepares the state directory
// and the exec FIFO, copies the config file and then calls
// libcrun_container_run_internal. On failure of that last step the container
// status is force-deleted. Returns a negative value on error.
// NOTE(review): braces are not shown in this excerpt.
libcrun_container_create (libcrun_context_t *context, libcrun_container_t *container, unsigned int options, libcrun_error_t *err)
runtime_spec_schema_config_schema *def = container->container_def;
int ret;
int container_ready_pipe[2];
cleanup_close int pipefd0 = -1;
cleanup_close int pipefd1 = -1;
cleanup_close int exec_fifo_fd = -1;
context->detach = 1; // 1 means the `crun create` command does not block and returns immediately (only `crun run` has detach set to 0)
container->context = context;
// Check the OCI version
if (def->oci_version && strstr (def->oci_version, "1.0") == NULL)
return crun_make_error (err, 0, "unknown version specified");
// Check the config file
ret = check_config_file (def, err);
if (UNLIKELY (ret < 0))
return ret;
if (def->process && def->process->terminal && context->console_socket == NULL)
return crun_make_error (err, 0, "use --console-socket with create when a terminal is used");
// Check the runtime directory, usually /var/run/crun/*
ret = libcrun_status_check_directories (context->state_root, context->id, err);
if (UNLIKELY (ret < 0))
return ret;
// Create the exec-wait FIFO; it mainly serves the create/start pair (the run
// command does not use it): start triggers the FIFO to let the container run
exec_fifo_fd = libcrun_status_create_exec_fifo (context->state_root, context->id, err);
if (UNLIKELY (exec_fifo_fd < 0))
return exec_fifo_fd;
// Hand the fd over to the context and reset the local so cleanup_close does not close it
context->fifo_exec_wait_fd = exec_fifo_fd;
exec_fifo_fd = -1;
if ((options & LIBCRUN_RUN_OPTIONS_PREFORK) == 0)
// Copy config.json from the bundle to the working directory
ret = libcrun_copy_config_file (context->id, context->state_root, context->bundle, err);
if (UNLIKELY (ret < 0))
return ret;
// Set up the container environment and run the container
ret = libcrun_container_run_internal (container, context, -1, err);
if (UNLIKELY (ret < 0))
force_delete_container_status (context, def);
return ret;
// Host-side driver for starting a container: opens the auxiliary fds (hooks
// output, seccomp, console socket), spawns the container process through
// libcrun_run_linux_container, enters and configures its cgroup, steps through
// the four sync points with the container process, runs the hooks, writes the
// container status and finally calls wait_for_process.
// NOTE(review): braces are not shown in this excerpt, so block boundaries must
// be inferred; the notes written after the closing `*/` of the `sync N`
// comments are outside the comment delimiters as listed here.
static int
libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_t *context, int container_ready_fd, libcrun_error_t *err)
runtime_spec_schema_config_schema *def = container->container_def;
int ret;
pid_t pid;
int detach = context->detach;
cleanup_free char *cgroup_path = NULL;
cleanup_free char *scope = NULL;
cleanup_close int terminal_fd = -1;
cleanup_terminal void *orig_terminal = NULL;
cleanup_close int sync_socket = -1;
cleanup_close int notify_socket = -1;
cleanup_close int socket_pair_0 = -1;
cleanup_close int socket_pair_1 = -1;
cleanup_close int seccomp_fd = -1;
cleanup_close int console_socket_fd = -1;
cleanup_close int hooks_out_fd = -1;
cleanup_close int hooks_err_fd = -1;
int cgroup_mode, cgroup_manager;
char created[35];
uid_t root_uid = -1;
gid_t root_gid = -1;
// Arguments passed to the container entrypoint (container_init)
struct container_entrypoint_s container_args =
.container = container,
.context = context,
.terminal_socketpair = {-1, -1},
.console_socket_fd = -1,
.hooks_out_fd = -1,
.hooks_err_fd = -1,
if (def->hooks && (def->hooks->prestart_len
|| def->hooks->poststart_len
|| def->hooks->create_runtime_len
|| def->hooks->create_container_len
|| def->hooks->start_container_len))
// Open the input/output fds for the hooks
ret = open_hooks_output (container, &hooks_out_fd, &hooks_err_fd, err);
if (UNLIKELY (ret < 0))
return ret;
container_args.hooks_out_fd = hooks_out_fd;
container_args.hooks_err_fd = hooks_err_fd;
container->context = context;
if (!detach || context->notify_socket)
ret = prctl (PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "set child subreaper");
if (!context->no_new_keyring)
{ // Create the keyring
ret = libcrun_create_keyring (container->context->id, err);
if (UNLIKELY (ret < 0))
return ret;
if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
container_args.has_terminal_socket_pair = 1;
ret = create_socket_pair (container_args.terminal_socketpair, err);
if (UNLIKELY (ret < 0))
return crun_error_wrap (err, "create terminal socket");
socket_pair_0 = container_args.terminal_socketpair[0];
socket_pair_1 = container_args.terminal_socketpair[1];
// Block signals
ret = block_signals (err);
if (UNLIKELY (ret < 0))
return ret;
if (def->linux && def->linux->seccomp)
{ // Create seccomp.bpf, used for system-call security filtering
ret = open_seccomp_output (context->id, &seccomp_fd, false, context->state_root, err);
if (UNLIKELY (ret < 0))
return ret;
container_args.seccomp_fd = seccomp_fd;
if (context->console_socket)
{// If a local socket was specified, use it as the container's standard input/output
console_socket_fd = open_unix_domain_client_socket (context->console_socket, 0, err);
if (UNLIKELY (console_socket_fd < 0))
return crun_error_wrap (err, "open console socket");
container_args.console_socket_fd = console_socket_fd;
// Get the cgroup mode (version, etc.)
cgroup_mode = libcrun_get_cgroup_mode (err);
if (cgroup_mode < 0)
return cgroup_mode;
// Run a Linux container; once it is running, container_init is called to initialize it
pid = libcrun_run_linux_container (container, container_init, &container_args,
&sync_socket, err);
if (UNLIKELY (pid < 0))
return pid;
if (context->fifo_exec_wait_fd < 0 && context->notify_socket)
/* Do not open the notify socket here on "create". "start" will take care of it. */
// NOTE(review): "?ify_socket" below looks like a mangled "&notify_socket"
// (an "&not" HTML entity was decoded) — confirm against upstream.
ret = get_notify_fd (context, container, ?ify_socket, err);
if (UNLIKELY (ret < 0))
return ret;
if (container_args.terminal_socketpair[1] >= 0)
close_and_reset (&socket_pair_1);
if (context->systemd_cgroup)
cgroup_manager = CGROUP_MANAGER_SYSTEMD;
else if (context->force_no_cgroup)
get_root_in_the_userns_for_cgroups (def, container->host_uid, container->host_gid, &root_uid, &root_gid);
struct libcrun_cgroup_args cg =
.resources = def->linux ? def->linux->resources : NULL,
.annotations = def->annotations,
.cgroup_mode = cgroup_mode,
.path = &cgroup_path,
.scope = &scope,
.cgroup_path = def->linux ? def->linux->cgroups_path : "",
.manager = cgroup_manager,
.pid = pid,
.root_uid = root_uid,
.root_gid = root_gid,
.id = context->id,
.systemd_subgroup = find_systemd_subgroup (container, cgroup_mode),
// Open (enter) the cgroup
ret = libcrun_cgroup_enter (&cg, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Configure the cgroup
if (def->linux && def->linux->resources)
ret = libcrun_update_cgroup_resources (cgroup_mode,
cgroup_path, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Sync 1: notify the container that it may proceed with init stage 1
/* sync 1. */ 通知容器可以进行1初始化
ret = sync_socket_send_sync (sync_socket, true, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Sync 2: wait for the container's notification before init stage 2
/* sync 2. */等待容器通知,可以进行2初始化
ret = sync_socket_wait_sync (context, sync_socket, false, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Run the hooks
if (def->hooks && def->hooks->prestart_len)
ret = do_hooks (def, pid, context->id, false, NULL, "created",
(hook **) def->hooks->prestart,
def->hooks->prestart_len, hooks_out_fd, hooks_err_fd, err);
if (UNLIKELY (ret != 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
if (def->hooks && def->hooks->create_runtime_len)
ret = do_hooks (def, pid, context->id, false, NULL, "created",
(hook **) def->hooks->create_runtime,
def->hooks->create_runtime_len, hooks_out_fd, hooks_err_fd, err);
if (UNLIKELY (ret != 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// If system-call filtering is in use, generate the BPF file here
if (seccomp_fd >= 0)
unsigned int seccomp_gen_options = 0;
const char *annotation;
annotation = find_annotation (container, "run.oci.seccomp_fail_unknown_syscall");
if (annotation && strcmp (annotation, "0") != 0)
ret = libcrun_generate_seccomp (container, seccomp_fd, seccomp_gen_options, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
close_and_reset (&seccomp_fd);
// Sync 3: notify the container process that it may proceed with init stage 3
/* sync 3. */ 通知容器进程,可以进行3初始化
ret = sync_socket_send_sync (sync_socket, true, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
{ // If no local socket was specified as the container's terminal, create one here
terminal_fd = receive_fd_from_socket (socket_pair_0, err);
if (UNLIKELY (terminal_fd < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
close_and_reset (&socket_pair_0);
// Set the terminal attributes on the socket
ret = libcrun_setup_terminal_master (terminal_fd, &orig_terminal, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Sync 4: wait for the container's notification before init stage 4
/* sync 4. */ 等待容器通知,可以进行4初始化
ret = sync_socket_wait_sync (context, sync_socket, false, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
ret = close_and_reset (&sync_socket);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
get_current_timestamp (created);
// Write the container status; pid is the container's process id
ret = write_container_status (container, context, pid, cgroup_path, scope, created, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// Run the "running" hooks
if (context->fifo_exec_wait_fd < 0 && def->hooks && def->hooks->poststart_len)
ret = do_hooks (def, pid, context->id, true, NULL, "running",
(hook **) def->hooks->poststart,
def->hooks->poststart_len, hooks_out_fd, hooks_err_fd, err);
if (UNLIKELY (ret < 0))
return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
// If detach is true this returns; the wait mainly serves `crun run`, for create it is effectively a no-op
ret = wait_for_process (pid, context, terminal_fd, notify_socket, container_ready_fd, err);
if (!context->detach)
cleanup_watch (context, 0, sync_socket, terminal_fd, err);
crun_error_release (err);
return ret;
// Spawns the container process. Prepares the namespace state, creates the
// host/container sync socket pair, applies rlimits and OOM settings, then
// clones. The parent (pid != 0) performs the user-namespace and fork
// handshakes over the sync socket, stores the host end in *sync_socket_out
// and returns the container pid. The child runs init_container and then
// jumps into `entrypoint`.
// NOTE(review): braces are not shown in this excerpt, so block boundaries must
// be inferred.
libcrun_run_linux_container (libcrun_container_t *container,
container_entrypoint_t entrypoint,
void *args,
int *sync_socket_out,
libcrun_error_t *err)
__attribute__((cleanup (cleanup_free_init_statusp))) struct init_status_s init_status;
runtime_spec_schema_config_schema *def = container->container_def;
cleanup_close int sync_socket_container = -1;
char *notify_socket_env = NULL;
cleanup_close int sync_socket_host = -1;
bool clone_can_create_userns;
int sync_socket[2];
pid_t pid;
size_t i;
int ret;
// Initialize the namespaces and open the namespace fds
ret = configure_init_status (&init_status, container, err);
if (UNLIKELY (ret < 0))
return ret;
get_private_data (container)->unshare_flags = init_status.all_namespaces;
/* cgroup will be unshared later. Once the process is in the correct cgroup. */
init_status.all_namespaces &= ~CLONE_NEWCGROUP;
// Create the socket pair used for communication between the container and the main process
ret = socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sync_socket);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "socketpair");
sync_socket_host = sync_socket[0];
sync_socket_container = sync_socket[1];
// NOTE(review): the call below is truncated in this excerpt (its remaining
// arguments and closing parenthesis are missing).
get_uid_gid_from_def (container->container_def,
/* Set the rlimit values */
if (def->process)
ret = libcrun_set_rlimits (def->process->rlimits, def->process->rlimits_len, err);
if (UNLIKELY (ret < 0))
return ret;
// Set the OOM attributes
ret = libcrun_set_oom (container, err);
if (UNLIKELY (ret < 0))
return ret;
// Namespace-related configuration: whether the new container process should
// support the corresponding namespaces
if ((init_status.all_namespaces & CLONE_NEWIPC) && (init_status.all_namespaces & CLONE_NEWUSER))
for (i = 0; i < init_status.fd_len; i++)
if (init_status.value[i] == CLONE_NEWIPC)
init_status.join_ipcns = true;
if (init_status.all_namespaces & CLONE_NEWPID)
init_status.must_fork = true;
for (i = 0; i < init_status.fd_len; i++)
if (init_status.value[i] == CLONE_NEWPID)
init_status.join_pidns = true;
if (setns (init_status.fd[i], CLONE_NEWPID) == 0)
init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
init_status.must_fork = false;
close_and_reset (&init_status.fd[i]);
/* It creates a new PID namespace, without a user namespace, we can try to
join it immediately without another fork. */
if (i == init_status.fd_len && (init_status.all_namespaces & CLONE_NEWUSER) == 0)
if (unshare (CLONE_NEWPID) == 0)
init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
init_status.must_fork = false;
if (init_status.all_namespaces & CLONE_NEWTIME)
init_status.must_fork = true;
clone_can_create_userns = init_status.fd_len == 0;
if ((init_status.all_namespaces & CLONE_NEWUSER) && init_status.userns_index < 0)
init_status.delayed_userns_create = !clone_can_create_userns || init_status.fd_len > 0;
// Create the container process
pid = syscall_clone ((init_status.namespaces_to_unshare & (clone_can_create_userns ? CLONE_NEWUSER : 0)) | SIGCHLD, NULL);
if (UNLIKELY (pid < 0))
return crun_make_error (err, errno, "clone");
if (clone_can_create_userns)
init_status.namespaces_to_unshare &= ~CLONE_NEWUSER;
if (pid)
{ // Main process: save extra information about the container
ret = save_external_descriptors (container, pid, err);
if (UNLIKELY (ret < 0))
return ret;
// Close the socket the main process does not need, i.e. the one used by the container
ret = close_and_reset (&sync_socket_container);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "close");
// With CLONE_NEWUSER, wait for the container to be ready
if (init_status.all_namespaces & CLONE_NEWUSER)
if (init_status.delayed_userns_create)
ret = expect_success_from_sync_socket (sync_socket_host, err);
if (UNLIKELY (ret < 0))
return ret;
if (init_status.userns_index < 0)
ret = libcrun_set_usernamespace (container, pid, err);
if (UNLIKELY (ret < 0))
return ret;
ret = TEMP_FAILURE_RETRY (write (sync_socket_host, "1", 1));
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "write to sync socket");
// If the container has to fork again, wait for the pid of the process it forks
if (init_status.must_fork)
pid_t grandchild = 0;
ret = expect_success_from_sync_socket (sync_socket_host, err);
if (UNLIKELY (ret < 0))
return ret;
ret = TEMP_FAILURE_RETRY (read (sync_socket_host, &grandchild, sizeof (grandchild)));
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "read pid from sync socket");
/* Cleanup the first process. */
waitpid (pid, NULL, 0);
pid = grandchild;
ret = expect_success_from_sync_socket (sync_socket_host, err);
if (UNLIKELY (ret < 0))
return ret;
*sync_socket_out = get_and_reset (&sync_socket_host);
// Return the final container process id
return pid;
/* Inside the container process. */
// In the container process, close the main process's end of the socket, which is not needed here
ret = close_and_reset (&sync_socket_host);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "close");
// Initialize the namespace-related state (join/add namespaces)
ret = init_container (container, sync_socket_container, &init_status, err);
if (UNLIKELY (ret < 0))
char failure = 1;
ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &failure, 1));
if (UNLIKELY (ret < 0))
goto localfail;
send_error_to_sync_socket_and_die (sync_socket_container, false, err);
libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
char success = 0;
ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &success, 1));
if (UNLIKELY (ret < 0))
return ret;
/* Jump into the specified entrypoint. */
if (container->context->notify_socket)
// NOTE(review): "¬ify_socket_env" below looks like a mangled
// "&notify_socket_env" (an "&not" HTML entity was decoded) — confirm against upstream.
xasprintf (¬ify_socket_env, "NOTIFY_SOCKET=%s/notify", container->context->notify_socket);
// Perform the real container initialization, i.e. container_init
entrypoint (args, notify_socket_env, sync_socket_container, err);
/* ENTRYPOINT returns only on an error, fallback here: */
if (*err)
libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
// Entrypoint executed inside the container process. Runs
// container_init_setup, signals sync 4 to the host, optionally blocks on the
// exec FIFO, applies seccomp when no_new_privileges is set, runs the
// start_container hooks and finally execv()s the user program. Returns only
// on error.
// NOTE(review): braces are not shown in this excerpt, so block boundaries must
// be inferred.
static int container_init (void *args, char *notify_socket, int sync_socket,
libcrun_error_t *err)
struct container_entrypoint_s *entrypoint_args = args;
int ret;
runtime_spec_schema_config_schema *def = entrypoint_args->container->container_def;
cleanup_free const char *exec_path = NULL;
cleanup_free char *notify_socket_cleanup = notify_socket;
entrypoint_args->sync_socket = sync_socket;
crun_set_output_handler (log_write_to_sync_socket, args, false);
// Initialize the container and set up its components
ret = container_init_setup (args, notify_socket, sync_socket, &exec_path, err);
if (UNLIKELY (ret < 0))
/* If it fails to write the error using the sync socket, then fallback
to stderr. */
if (sync_socket_write_error (sync_socket, err) < 0)
return ret;
crun_error_release (err);
return ret;
entrypoint_args->sync_socket = -1;
ret = unblock_signals (err);
if (UNLIKELY (ret < 0))
return ret;
// Sync 4: notify the main process that step 4 of the initialization may proceed
// (the note after the closing `*/` below is outside the comment in this excerpt)
/* sync 4. */ 通知主进程,可以进行第4步初始化
ret = sync_socket_send_sync (sync_socket, false, err);
if (UNLIKELY (ret < 0))
return ret;
close_and_reset (&sync_socket);
// fifo_exec_wait_fd >= 0 means this is a create invocation: block here until
// `crun start` is run from the command line to release it
if (entrypoint_args->context->fifo_exec_wait_fd >= 0)
char buffer[1];
fd_set read_set;
cleanup_close int fd = entrypoint_args->context->fifo_exec_wait_fd;
entrypoint_args->context->fifo_exec_wait_fd = -1;
FD_ZERO (&read_set);
FD_SET (fd, &read_set);
ret = select (fd + 1, &read_set, NULL, NULL, NULL);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "select");
ret = TEMP_FAILURE_RETRY (read (fd, buffer, sizeof (buffer)));
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "read from the exec fifo");
// NOTE(review): the matching `do` for this `while` appears to have been
// elided from the excerpt — confirm against upstream.
while (ret == 0);
close_and_reset (&entrypoint_args->context->fifo_exec_wait_fd);
// Route log output to standard error
crun_set_output_handler (log_write_to_stderr, NULL, false);
if (def->process && def->process->no_new_privileges)
char **seccomp_flags = NULL;
size_t seccomp_flags_len = 0;
if (def->linux && def->linux->seccomp)
seccomp_flags = def->linux->seccomp->flags;
seccomp_flags_len = def->linux->seccomp->flags_len;
// Apply the system-call filtering (seccomp) configuration
ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
if (UNLIKELY (ret < 0))
return ret;
close_and_reset (&entrypoint_args->seccomp_fd);
if (UNLIKELY (def->process == NULL))
return crun_make_error (err, 0, "block 'process' not found");
if (UNLIKELY (exec_path == NULL))
return crun_make_error (err, 0, "executable path not specified");
// Run the "starting" hooks
if (def->hooks && def->hooks->start_container_len)
libcrun_container_t *container = entrypoint_args->container;
// NOTE(review): the do_hooks call below is truncated in this excerpt (its
// trailing arguments are missing).
ret = do_hooks (def, 0, container->context->id, false, NULL, "starting",
(hook **) def->hooks->start_container,
if (UNLIKELY (ret != 0))
return ret;
/* Seek stdout/stderr to the end. If the hooks were using the same files,
the container process overwrites what was previously written. */
(void) lseek (1, 0, SEEK_END);
(void) lseek (2, 0, SEEK_END);
// Exec the application inside the container; from here on the user program runs
execv (exec_path, def->process->args);
if (errno == ENOENT)
return crun_make_error (err, errno, "exec container process (missing dynamic library?) `%s`", exec_path);
return crun_make_error (err, errno, "exec container process `%s`", exec_path);
// Container-side setup performed before exec: security initialization,
// network, sysctl, mounts, the sync 1-3 handshakes with the main process,
// create_container hooks, SELinux/AppArmor labels, pivot_root, environment,
// working directory, executable lookup (stored in *exec_path), terminal
// hand-off, hostname, personality, umask, seccomp (when no_new_privileges is
// not set) and capabilities. Returns 0 on success, a negative value on error.
// NOTE(review): braces are not shown in this excerpt, so block boundaries must
// be inferred; the notes after the closing `*/` of the `sync N` comments are
// outside the comment delimiters as listed here.
static int
container_init_setup (void *args, char *notify_socket,
int sync_socket, const char **exec_path,
libcrun_error_t *err)
struct container_entrypoint_s *entrypoint_args = args;
libcrun_container_t *container = entrypoint_args->container;
int ret;
int has_terminal;
cleanup_close int console_socket = -1;
cleanup_close int console_socketpair = -1;
runtime_spec_schema_config_schema *def = container->container_def;
runtime_spec_schema_config_schema_process_capabilities *capabilities;
cleanup_free char *rootfs = NULL;
int no_new_privs;
// Initialize the security-related state (SELinux and AppArmor)
ret = initialize_security (def->process, err);
if (UNLIKELY (ret < 0))
return ret;
// Bring UP the loopback network
ret = libcrun_configure_network (container, err);
if (UNLIKELY (ret < 0))
return ret;
// Get the real path of the rootfs
rootfs = realpath (def->root->path, NULL);
if (UNLIKELY (rootfs == NULL))
/* If realpath failed for any reason, try the relative directory. */
rootfs = xstrdup (def->root->path);
// Get the terminal input/output fd
if (entrypoint_args->terminal_socketpair[0] >= 0)
close_and_reset (&entrypoint_args->terminal_socketpair[0]);
console_socketpair = entrypoint_args->terminal_socketpair[1];
// Sync 1: wait for the main process's notification before step 1 of the initialization
/* sync 1. */ 等待主进程通知可以开始第1步初始化
ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
if (UNLIKELY (ret < 0))
return ret;
has_terminal = container->container_def->process && container->container_def->process->terminal;
if (has_terminal && entrypoint_args->context->console_socket)
console_socket = entrypoint_args->console_socket_fd;
// Configure the sysctl parameters
ret = libcrun_set_sysctl (container, err);
if (UNLIKELY (ret < 0))
return ret;
// Mount the required file systems (including the new rootfs)
ret = libcrun_set_mounts (container, rootfs, err);
if (UNLIKELY (ret < 0))
return ret;
// Sync 2: notify the main process that stage 2 of the initialization may start
/* sync 2. */ 通知主进程,可以开始2阶段初始化
ret = sync_socket_send_sync (sync_socket, false, err);
if (UNLIKELY (ret < 0))
return ret;
// Sync 3: wait for the main process's notification before stage 3 of the initialization
/* sync 3. */ 等待主进程通知,可以进行第3阶段初始化
ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
if (UNLIKELY (ret < 0))
return ret;
// Run the "created" hooks
if (def->hooks && def->hooks->create_container_len)
// NOTE(review): the do_hooks call below is truncated in this excerpt (its
// trailing arguments are missing).
ret = do_hooks (def, 0, container->context->id, false, NULL, "created",
(hook **) def->hooks->create_container,
if (UNLIKELY (ret != 0))
return ret;
// Set the SELinux-related label
if (def->process)
ret = libcrun_set_selinux_exec_label (def->process, err);
if (UNLIKELY (ret < 0))
return ret;
ret = libcrun_set_apparmor_profile (def->process, err);
if (UNLIKELY (ret < 0))
return ret;
// Close the surplus fds/sockets
ret = close_fds_ge_than (entrypoint_args->context->preserve_fds + 3, err);
if (UNLIKELY (ret < 0))
crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err);
// Switch to the new rootfs and work from there
ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err);
if (UNLIKELY (ret < 0))
return ret;
// Reopen /dev/null
ret = libcrun_reopen_dev_null (err);
if (UNLIKELY (ret < 0))
return ret;
if (clearenv ())
return crun_make_error (err, errno, "clearenv");
// The environment was cleared above; set the new environment variables here
if (def->process)
size_t i;
for (i = 0; i < def->process->env_len; i++)
if (putenv (def->process->env[i]) < 0)
return crun_make_error (err, errno, "putenv `%s`", def->process->env[i]);
if (getenv ("HOME") == NULL)
ret = set_home_env (container->container_uid);
if (UNLIKELY (ret < 0 && errno != ENOTSUP))
setenv("HOME", "/", 1);
libcrun_warning ("cannot detect HOME environment variable, setting default");
if (def->process && def->process->cwd)
if (UNLIKELY (chdir (def->process->cwd) < 0))
return crun_make_error (err, errno, "chdir");
// Locate the path of the user program
if (def->process && def->process->args)
*exec_path = find_executable (def->process->args[0], def->process->cwd);
if (UNLIKELY (*exec_path == NULL))
if (errno == ENOENT)
return crun_make_error (err, errno, "executable file not found in $PATH");
return crun_make_error (err, errno, "open executable");
ret = setsid ();
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "setsid");
// If there is a terminal, send the terminal fd to the main process
if (has_terminal)
cleanup_close int terminal_fd = -1;
fflush (stderr);
terminal_fd = libcrun_set_terminal (container, err);
if (UNLIKELY (terminal_fd < 0))
return terminal_fd;
if (console_socket >= 0)
ret = send_fd_to_socket (console_socket, terminal_fd, err);
if (UNLIKELY (ret < 0))
return ret;
close_and_reset (&console_socket);
else if (entrypoint_args->has_terminal_socket_pair && console_socketpair >= 0)
ret = send_fd_to_socket (console_socketpair, terminal_fd, err);
if (UNLIKELY (ret < 0))
return ret;
close_and_reset (&console_socketpair);
// Set the container's hostname
ret = libcrun_set_hostname (container, err);
if (UNLIKELY (ret < 0))
return ret;
if (container->container_def->linux && container->container_def->linux->personality)
ret = libcrun_set_personality (container->container_def->linux->personality, err);
if (UNLIKELY (ret < 0))
return ret;
if (def->process->user)
umask (def->process->user->umask_present ? def->process->user->umask : 0022);
// Load the prepared seccomp.bpf rules into the system-call filter
if (def->process && !def->process->no_new_privileges)
char **seccomp_flags = NULL;
size_t seccomp_flags_len = 0;
if (def->linux && def->linux->seccomp)
seccomp_flags = def->linux->seccomp->flags;
seccomp_flags_len = def->linux->seccomp->flags_len;
ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
if (UNLIKELY (ret < 0))
return ret;
close_and_reset (&entrypoint_args->seccomp_fd);
capabilities = def->process ? def->process->capabilities : NULL;
no_new_privs = def->process ? def->process->no_new_privileges : 1;
ret = libcrun_set_caps (capabilities, container->container_uid, container->container_gid, no_new_privs, err);
if (UNLIKELY (ret < 0))
return ret;
if (notify_socket)
if (putenv (notify_socket) < 0)
return crun_make_error (err, errno, "putenv `%s`", notify_socket);
return 0;
// Host-side wait loop. Writes the pid file when requested, returns 0
// immediately when detached with no notify socket, otherwise blocks all
// signals, turns them into a signalfd and multiplexes the signalfd, the
// notify socket and the terminal through epoll until the container's last
// process exits, returning its exit code.
// NOTE(review): most braces are not shown in this excerpt (a few stray `{`/`}`
// remain next to comments), so block boundaries must be inferred.
static int wait_for_process (pid_t pid, libcrun_context_t *context, int terminal_fd, int notify_socket, int container_ready_fd, libcrun_error_t *err)
cleanup_close int epollfd = -1;
cleanup_close int signalfd = -1;
int ret, container_exit_code = 0, last_process;
sigset_t mask;
int fds[10];
int levelfds[10];
int levelfds_len = 0;
int fds_len = 0;
container_exit_code = 0;
if (context->pid_file)
char buf[12];
size_t buf_len = sprintf (buf, "%d", pid);
ret = write_file (context->pid_file, buf, buf_len, err);
if (UNLIKELY (ret < 0))
return ret;
/* crun create returns here, whereas crun run keeps going. */
if (context->detach && notify_socket < 0)
return 0;
if (container_ready_fd >= 0)
ret = 0;
TEMP_FAILURE_RETRY (write (container_ready_fd, &ret, sizeof (ret)));
close_and_reset (&container_ready_fd);
sigfillset (&mask);
ret = sigprocmask (SIG_BLOCK, &mask, NULL);
if (UNLIKELY (ret < 0))
return crun_make_error (err, errno, "sigprocmask");
// Convert all signals delivered to the current main process into an fd.
signalfd = create_signalfd (&mask, err);
if (UNLIKELY (signalfd < 0))
return signalfd;
// Wait for a child process to exit
ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
if (UNLIKELY (ret < 0))
return ret;
if (last_process)
return container_exit_code;
// Add the terminal and systemd fds to the set being polled
fds[fds_len++] = signalfd;
if (notify_socket >= 0)
fds[fds_len++] = notify_socket;
if (terminal_fd >= 0)
fds[fds_len++] = 0;
levelfds[levelfds_len++] = terminal_fd;
// Both arrays are terminated with -1
fds[fds_len++] = -1;
levelfds[levelfds_len++] = -1;
// Create the epoll instance
epollfd = epoll_helper (fds, levelfds, err);
if (UNLIKELY (epollfd < 0))
return epollfd;
while (1)
struct signalfd_siginfo si;
ssize_t res;
struct epoll_event events[10];
int i, nr_events;
nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, 10, -1));
if (UNLIKELY (nr_events < 0))
return crun_make_error (err, errno, "epoll_wait");
for (i = 0; i < nr_events; i++)
{ // Forwarding happens here, so crun run can show the container's input/output in real time
if (events[i].data.fd == 0)
ret = copy_from_fd_to_fd (0, terminal_fd, 0, err);
if (UNLIKELY (ret < 0))
return crun_error_wrap (err, "copy to terminal fd");
else if (events[i].data.fd == terminal_fd)
ret = set_blocking_fd (terminal_fd, 0, err);
if (UNLIKELY (ret < 0))
return crun_error_wrap (err, "set terminal fd not blocking");
ret = copy_from_fd_to_fd (terminal_fd, 1, 1, err);
if (UNLIKELY (ret < 0))
return crun_error_wrap (err, "copy from terminal fd");
ret = set_blocking_fd (terminal_fd, 1, err);
if (UNLIKELY (ret < 0))
return crun_error_wrap (err, "set terminal fd blocking");
}// If it is for systemd
else if (events[i].data.fd == notify_socket)
ret = handle_notify_socket (notify_socket, err);
if (UNLIKELY (ret < 0))
return ret;
if (ret && context->detach)
return 0;
} // A received signal
else if (events[i].data.fd == signalfd)
res = TEMP_FAILURE_RETRY (read (signalfd, &si, sizeof (si)));
if (UNLIKELY (res < 0))
return crun_make_error (err, errno, "read from signalfd");
if (si.ssi_signo == SIGCHLD)
{// The container has exited; crun run returns
ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
if (UNLIKELY (ret < 0))
return ret;
if (last_process)
return container_exit_code;
/* Send any other signal to the child process. */
ret = kill (pid, si.ssi_signo);
return crun_make_error (err, 0, "unknown fd from epoll_wait");
return 0;
另外,crun大致涉及systemd、seccomp、sysctl、oom、keyring、rlimit、apparmor、selinux、cgroup、namespace(UTS、IPC、PID、NET、MOUNT、USER)等与Linux强相关的组件。
创建容器: crun create container_id (当前目录下必须有config.json和rootfs)
运行容器: crun start container_id
删除容器: crun delete container_id
创建+运行容器: crun run container_id
查看进程id:crun ps container_id
阅读(12194) | 评论(0) | 转发(1) |