同一款产品最近发现在reboot前对文件做的修改没有生效。之前是可以的。
首先可以明确的是,改动没有从缓存同步到磁盘就重启了。
对比新旧版本差异,发现新版本换用了更高版本的busybox 1.17.2,而老版本使用的busybox 1.01没有问题。比较两者的reboot代码流程(init.c里面),发现代码改动还是蛮大的。
主要差别,个人认为有2点:
1)run_actions(SHUTDOWN);流程里面的waitpid改为wait,且wait后判断动作发生变化
1.01版本waitfor代码
点击(此处)折叠或打开
-
/*
 * Start the action with run() and block in waitpid() until the pid that
 * run() returned has been reaped.
 *
 * Returns the last waitpid() result: the child's pid when it was reaped
 * here, or -1 when waitpid() failed with ECHILD.
 */
static int waitfor(const struct init_action *a)
{
	const int child = run(a);
	int exit_status;
	int reaped;

	do {
		reaped = waitpid(child, &exit_status, 0);
		/*
		 * Stop once our child was reaped, or once ECHILD tells us its
		 * termination was missed. Every other outcome is simply retried.
		 * FIXME other errors should maybe trigger an error, but allow
		 * the program to continue.
		 */
	} while (reaped != child && !(reaped == -1 && errno == ECHILD));

	return reaped;
}
1.17.2版本waitfor代码
-
static void waitfor(pid_t pid)
-
{
-
/* waitfor(run(x)): protect against failed fork inside run() */
-
if (pid <= 0)
-
return;
-
-
/* Wait for any child (prevent zombies from exiting orphaned processes)
-
* but exit the loop only when specified one has exited. */
-
while (1) {
-
pid_t wpid = wait(NULL);
-
message(L_LOG | L_CONSOLE, "wait wpid:%d, pid:%d\n",wpid,pid);
-
mark_terminated(wpid);
-
/* Unsafe. SIGTSTP handler might have wait'ed it already */
-
/*if (wpid == pid) break;*/
-
/* More reliable: */
-
if (kill(pid, 0))
-
break;
-
}
-
}
这个变化很早就有了,担心wait的pid不是SHUTDOWN在inittab中指定的程序,加入log诊断。发现这块没有问题。
2)实施系统关闭的流程,调用
run_actions(SHUTDOWN);的实现变化
1.01中代码为:
-
static void shutdown_system(void)
-
{
-
sigset_t block_signals;
-
-
/* run everything to be run at "shutdown". This is done _prior_
-
* to killing everything, in case people wish to use scripts to
-
* shut things down gracefully... */
-
run_actions(SHUTDOWN);
-
-
/* first disable all our signals */
-
sigemptyset(&block_signals);
-
sigaddset(&block_signals, SIGHUP);
-
sigaddset(&block_signals, SIGQUIT);
-
sigaddset(&block_signals, SIGCHLD);
-
sigaddset(&block_signals, SIGUSR1);
-
sigaddset(&block_signals, SIGUSR2);
-
sigaddset(&block_signals, SIGINT);
-
sigaddset(&block_signals, SIGTERM);
-
sigaddset(&block_signals, SIGCONT);
-
sigaddset(&block_signals, SIGSTOP);
-
sigaddset(&block_signals, SIGTSTP);
-
sigprocmask(SIG_BLOCK, &block_signals, NULL);
-
-
/* Allow Ctrl-Alt-Del to reboot system. */
-
init_reboot(RB_ENABLE_CAD);
-
-
message(CONSOLE | LOG, "The system is going down NOW !!");
-
sync();
-
-
/* Send signals to every process _except_ pid 1 */
-
message(CONSOLE | LOG, "Sending SIGTERM to all processes.");
-
kill(-1, SIGTERM);
-
sleep(1);
-
sync();
-
-
message(CONSOLE | LOG, "Sending SIGKILL to all processes.");
-
kill(-1, SIGKILL);
-
sleep(1);
-
-
sync();
-
}
1.17.2代码为:
-
static void run_shutdown_and_kill_processes(void)
-
{
-
/* Run everything to be run at "shutdown". This is done _prior_
-
* to killing everything, in case people wish to use scripts to
-
* shut things down gracefully... */
-
message(L_CONSOLE | L_LOG, "SHUTDOWN!");
-
run_actions(SHUTDOWN);
-
-
message(L_CONSOLE | L_LOG, "The system is going down NOW!");
-
-
/* Send signals to every process _except_ pid 1 */
-
kill(-1, SIGTERM);
-
message(L_CONSOLE | L_LOG, "Sent SIG%s to all processes", "TERM");
-
sync();
-
sleep(1);
-
kill(-1, SIGKILL);
-
message(L_CONSOLE, "Sent SIG%s to all processes", "KILL");
-
sync();
-
/*sleep(1); - callers take care about making a pause */
-
}
差异有几个地方:
a)老版本kill前有进行信号屏蔽,新版本没有。担心kill影响,可以查看http://blog.csdn.net/zanget/article/details/6659838,对kill解释如下
-
int kill(pid_t pid, int sig);
-
-
1. pid>0时,pid是信号欲送往的进程的标识。
-
-
2. pid=0时,信号送往与调用kill()的进程属于同一个进程组的所有进程。
-
-
3. pid=-1时,信号将送往所有调用进程有权给其发送信号的进程,除了进程1(init)。
-
-
4. pid<-1时,信号将送往以-pid为组标识的进程。
b)减少了1个sleep(1),注释为callers take care about making a pause,即由调用者负责进行停顿。
后来发现,放开(取消注释)sleep(1)后,测试成功率有所提高。最终测试可靠的修改如下:
-
static void run_shutdown_and_kill_processes(void)
-
{
-
/* Run everything to be run at "shutdown". This is done _prior_
-
* to killing everything, in case people wish to use scripts to
-
* shut things down gracefully... */
-
run_actions(SHUTDOWN);
-
-
message(L_CONSOLE | L_LOG, "The system is going down NOW!");
-
sync();
-
sleep(2);
-
/* Send signals to every process _except_ pid 1 */
-
kill(-1, SIGTERM);
-
message(L_CONSOLE | L_LOG, "Sent SIG%s to all processes", "TERM");
-
sleep(1);
-
sync();
-
kill(-1, SIGKILL);
-
message(L_CONSOLE, "Sent SIG%s to all processes", "KILL");
-
sleep(1); /*- callers take care about making a pause */
-
message(L_CONSOLE, "Last Sync");
-
sync();
-
}
问题查证过程用了好几个VB测试脚本,最终选择如下:
-
#$language = "VBScript"
#$interface = "1.0"

' Reboot stress test driven through the terminal session.
' Each pass sends the login strings, lists /userconfig, creates and then
' removes /userconfig/calmode, issues "reboot" and sleeps before the next pass.
Sub main

    Do While True
        ' "root" is sent twice with a 500 ms gap
        ' (presumably login name, then password - confirm against the target).
        crt.Screen.Send "root" & VbCr
        crt.Sleep 500
        crt.Screen.Send "root" & VbCr
        crt.Screen.Send "" & VbCr

        ' Mark the pass in the session output: "#" followed by the current time.
        crt.Screen.Send "#"
        crt.Screen.Send Now & VbCr

        crt.Screen.Send "ls /userconfig -l" & VbCr
        crt.Sleep 500

        ' Create the file, wait 3.5 s, then delete it right before rebooting.
        crt.Screen.Send "echo 1 > /userconfig/calmode" & VbCr
        crt.Sleep 3500
        crt.Screen.Send "rm /userconfig/calmode" & VbCr
        crt.Screen.Send "" & VbCr

        crt.Screen.Send "reboot" & VbCr
        ' 60 s pause before the next pass (presumably long enough for the device to boot).
        crt.Sleep 60000
        crt.Screen.Send "" & VbCr
    Loop

End Sub
中间停顿3500是3.5s,大于inode脏页时间。对应的一些缓存写入参考:
节选:linux内核有着非常强大的磁盘缓存机制,就是磁盘数据先不往磁盘直接读写而是直接操作缓存,待到一定条件满足的时候才读写磁盘,大致有几个参数:1.dirty_writeback_centisecs,这个参数表示内核刷新缓存的时间间隔;2.dirty_expire_centisecs,这个参数表示一个inode在dirty状态停留的最长时间。
也尝试过在rm动作后停顿3.5s,测试发现还是存在小概率失败。
中对sync命令有解释,也可以man查看。也有看到讲linux已经实现sync的同步,但测试看不是这样,或许跟linux kernel版本有关。
-
buffer:为了解决写磁盘的效率
-
cache:为了解决读磁盘的效率
linux dirtypage回写时机
1 定时方式: 定时回写是基于这样的原则:/proc/sys/vm/dirty_writeback_centisecs的值表示多长时间会启动回写线程,由这个定时器启动的回写线程只回写在内存中为dirty时间超过(/proc/sys/vm/dirty_expire_centisecs / 100)秒的页(这个值默认是3000,也就是30秒),一般情况下dirty_writeback_centisecs的值是500,也就是5秒,所以默认情况下系统会5秒钟启动一次回写线程,把dirty时间超过30秒的页回写,要注意的是,这种方式启动的回写线程只回写超时的dirty页,不会回写没超时的dirty页,可以通过修改/proc中的这两个值,细节查看内核函数wb_kupdate。
2 内存不足的时候: 这时并不将所有的dirty页写到磁盘,而是每次写大概1024个页面,直到空闲页面满足需求为止
3 写操作时发现脏页超过一定比例: 当脏页占系统内存的比例超过/proc/sys/vm/dirty_background_ratio 的时候,write系统调用会唤醒pdflush回写dirty page,直到脏页比例低于/proc/sys/vm/dirty_background_ratio,但write系统调用不会被阻塞,立即返回。当脏页占系统内存的比例超过/proc/sys/vm/dirty_ratio的时候,write系统调用会被阻塞,主动回写dirty page,直到脏页比例低于/proc/sys/vm/dirty_ratio。
其他参考
http://blog.chinaunix.net/xmlrpc.php?r=blog/article&uid=25597477&id=4778214
http://blog.csdn.net/wavemcu/article/details/8544333这个对reboot流程讲解相对详细,特别有用的是kernel
阅读(2145) | 评论(0) | 转发(0) |