1. 正常退出
如果你用 strace 跟一下最简单的linux命令,比如ls、lsof等,你会发现这些进程退出都会调用系统调用exit_group,那我们就从这个系统调用开始。
[ kernel/exit.c ]
871 NORET_TYPE void
872 do_group_exit(int exit_code)
873 {
874 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
875
876 if (current->signal->group_exit)
877 exit_code = current->signal->group_exit_code;
878 else if (!thread_group_empty(current)) {
879 struct signal_struct *const sig = current->signal;
880 struct sighand_struct *const sighand = current->sighand;
881 read_lock(&tasklist_lock);
882 spin_lock_irq(&sighand->siglock);
883 if (sig->group_exit)
884 /* Another thread got here before we took the lock. */
885 exit_code = sig->group_exit_code;
886 else {
887 sig->group_exit = 1;
888 sig->group_exit_code = exit_code;
889 zap_other_threads(current);
890 }
891 spin_unlock_irq(&sighand->siglock);
892 read_unlock(&tasklist_lock);
893 }
894
895 do_exit(exit_code);
896 /* NOTREACHED */
897 }
898
899 /*
900 * this kills every thread in the thread group. Note that any externally
901 * wait4()-ing process will get the correct exit code - even if this
902 * thread is not the thread group leader.
903 */
904 asmlinkage void sys_exit_group(int error_code)
905 {
906 do_group_exit((error_code & 0xff) << 8);
907 }
上面的sys_exit_group就是系统调用exit_group的实现,它调用了do_group_exit。do_group_exit负责让同一线程组(thread group)里的其他线程也一起退出(见zap_other_threads),不过这部分我们不关心,我们关心的是它最后调用了do_exit。一个进程正常退出就是指exit,其实在内核里的实现就是
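do_exit(废话!)。还要注意,2.6.9内核的源码里有sys_exit_group这个函数,2.6.32里已经搜不到这个函数名了,在哪个版本消失的?这个我们以后再说。(注:exit_group系统调用本身并没有被删除,较新的内核(包括2.6.32)只是改用SYSCALL_DEFINE1(exit_group, int, error_code)这样的宏来定义它,所以直接搜sys_exit_group找不到。)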
[kernel/exit.c]
783 asmlinkage NORET_TYPE void do_exit(long code)
784 {
785 struct task_struct *tsk = current;
786
787 profile_task_exit(tsk);
788
789 if (unlikely(in_interrupt()))
790 panic("Aiee, killing interrupt handler!");
791 if (unlikely(!tsk->pid))
792 panic("Attempted to kill the idle task!");
793 if (unlikely(tsk->pid == 1))
794 panic("Attempted to kill init!");
795 if (tsk->io_context)
796 exit_io_context();
797 tsk->flags |= PF_EXITING;
798 del_timer_sync(&tsk->real_timer);
799
800 if (unlikely(in_atomic()))
801 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
802 current->comm, current->pid,
803 preempt_count());
804
805 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
806 current->ptrace_message = code;
807 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
808 }
809
810 acct_process(code);
811 __exit_mm(tsk);
812
813 exit_sem(tsk);
814 __exit_files(tsk);
815 __exit_fs(tsk);
816 exit_namespace(tsk);
817 exit_thread();
do_exit做的事情不多:退出mm(每个struct task_struct管辖的内存都在这个mm里),退出namespace,退出thread(主要是关闭TSS段上的IOMAP)等。我们关心的是__exit_files(),这里就不贴代码了,都是一些短小的函数(功能单一,函数短小,是好的编码风格,但这么多的函数,如何起名字?这是个挑战)。__exit_files()调用put_files_struct(),而put_files_struct()调用close_files(),关闭这个退出进程的所有打开文件,close_files()接着调用filp_close():
[fs/open.c]
989 int filp_close(struct file *filp, fl_owner_t id)
990 {
991 int retval;
992
993 /* Report and clear outstanding errors */
994 retval = filp->f_error;
995 if (retval)
996 filp->f_error = 0;
997
998 if (!file_count(filp)) {
999 printk(KERN_ERR "VFS: Close: file count is 0\n");
1000 return retval;
1001 }
1002
1003 if (filp->f_op && filp->f_op->flush) {
1004 int err = filp->f_op->flush(filp);
1005 if (!retval)
1006 retval = err;
1007 }
1008
1009 dnotify_flush(filp, id);
1010 locks_remove_posix(filp, id);
1011 fput(filp);
1012 return retval;
1013 }
上面是filp_close的实现,再清楚不过了:只要是普通的文件,谁打开谁就得负责关闭,而且关闭之前必须flush(前提是该文件的f_op实现了flush方法)。有些程序open了某个文件,没有调用close就正常退出了,这种情况内核其实也通过do_exit帮这个要死的进程关闭(并flush)了它打开的文件,所以不用担心,没有什么资源泄漏。
2. 被信号杀死
这涉及到信号。linux的信号机制比BSD的复杂,这里不详述,相关资料已经讲得很清楚。这里要关注的是,内核只在回到用户空间之前处理信号,处理信号的入口是do_signal:
[arch/i386/kernel/signal.c]
573 int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
574 {
575 siginfo_t info;
576 int signr;
577 struct k_sigaction ka;
578
579 /*
580 * We want the common case to go fast, which
581 * is why we may in certain cases get here from
582 * kernel mode. Just return without doing anything
583 * if so.
584 */
585 if ((regs->xcs & 3) != 3)
586 return 1;
587
588 if (current->flags & PF_FREEZE) {
589 refrigerator(0);
590 goto no_signal;
591 }
592
593 if (!oldset)
594 oldset = &current->blocked;
595
596 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
get_signal_to_deliver()的代码很多,但主要就是循环调用dequeue_signal,从信号队列里拿出所有待处理的信号,逐一处理之,注释如下。
[kernel/signal.c --> get_signal_to_deliver]
1831 int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1832 struct pt_regs *regs, void *cookie)
1833 {
1834 sigset_t *mask = &current->blocked;
1835 int signr = 0;
1836
1837 relock:
1838 spin_lock_irq(&current->sighand->siglock);
1839 for (;;) {
1840 struct k_sigaction *ka;
1841
1842 if (unlikely(current->signal->group_stop_count > 0) &&
1843 handle_group_stop())
1844 goto relock;
1845
1846 signr = dequeue_signal(current, mask, info); // 从当前进程的信号队列里取出信号
1847
1848 if (!signr)
1849 break; /* will return 0 */
1850
1851 if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { // 如果当前进程正被跟踪(例如被strace跟踪),且取出的信号不是SIGKILL,则处理之。此代码块内就是strace工作原理。
1852 ptrace_signal_deliver(regs, cookie);
1853
1854 /* Let the debugger run. */
1855 ptrace_stop(signr, info);
1856
1857 /* We're back. Did the debugger cancel the sig? */
1858 signr = current->exit_code;
1859 if (signr == 0)
1860 continue;
1861
1862 current->exit_code = 0;
1863
1864 /* Update the siginfo structure if the signal has
1865 changed. If the debugger wanted something
1866 specific in the siginfo structure then it should
1867 have updated *info via PTRACE_SETSIGINFO. */
1868 if (signr != info->si_signo) {
1869 info->si_signo = signr;
1870 info->si_errno = 0;
1871 info->si_code = SI_USER;
1872 info->si_pid = current->parent->pid;
1873 info->si_uid = current->parent->uid;
1874 }
1875
1876 /* If the (new) signal is now blocked, requeue it. */
1877 if (sigismember(&current->blocked, signr)) {
1878 specific_send_sig_info(signr, info, current);
1879 continue;
1880 }
1881 }
1882
1883 ka = &current->sighand->action[signr-1];
1884 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ // 如果用户要求忽略,那就继续循环,处理下一个信号
1885 continue;
1886 if (ka->sa.sa_handler != SIG_DFL) { // 如果用户自己实现了信号处理函数,则执行之
1887 /* Run the handler. */
1888 *return_ka = *ka;
1889
1890 if (ka->sa.sa_flags & SA_ONESHOT)
1891 ka->sa.sa_handler = SIG_DFL;
1892
1893 break; /* will return non-zero "signr" value */
1894 }
1895
1896 /*
1897 * Now we are doing the default action for this signal.
1898 */
1899 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1900 continue;
1901
1902 /* Init gets no signals it doesn't want. */
1903 if (current->pid == 1)
1904 continue;
1905
1906 if (sig_kernel_stop(signr)) { // 如果是SIGSTOP,SIGTSTP,SIGTTIN,SIGTTOU之一,则执行下面的代码块
1907 /*
1908 * The default action is to stop all threads in
1909 * the thread group. The job control signals
1910 * do nothing in an orphaned pgrp, but SIGSTOP
1911 * always works. Note that siglock needs to be
1912 * dropped during the call to is_orphaned_pgrp()
1913 * because of lock ordering with tasklist_lock.
1914 * This allows an intervening SIGCONT to be posted.
1915 * We need to check for that and bail out if necessary.
1916 */
1917 if (signr == SIGSTOP) {
1918 do_signal_stop(signr); /* releases siglock */
1919 goto relock;
1920 }
1921 spin_unlock_irq(&current->sighand->siglock);
1922
1923 /* signals can be posted during this window */
1924
1925 if (is_orphaned_pgrp(process_group(current)))
1926 goto relock;
1927
1928 spin_lock_irq(&current->sighand->siglock);
1929 if (unlikely(sig_avoid_stop_race())) {
1930 /*
1931 * Either a SIGCONT or a SIGKILL signal was
1932 * posted in the siglock-not-held window.
1933 */
1934 continue;
1935 }
1936
1937 do_signal_stop(signr); /* releases siglock */
1938 goto relock;
1939 }
1940
1941 spin_unlock_irq(&current->sighand->siglock);
1942
1943 /*
1944 * Anything else is fatal, maybe with a core dump.
1945 */
1946 current->flags |= PF_SIGNALED;
1947 if (sig_kernel_coredump(signr) &&
1948 do_coredump((long)signr, signr, regs)) { // 注意,是SIGSEGV或SIGQUIT或SIGILL等信号,要coredump了!
1949 /*
1950 * That killed all other threads in the group and
1951 * synchronized with their demise, so there can't
1952 * be any more left to kill now. The group_exit
1953 * flags are set by do_coredump. Note that
1954 * thread_group_empty won't always be true yet,
1955 * because those threads were blocked in __exit_mm
1956 * and we just let them go to finish dying.
1957 */
1958 const int code = signr | 0x80;
1959 BUG_ON(!current->signal->group_exit);
1960 BUG_ON(current->signal->group_exit_code != code);
1961 do_exit(code); // 即使coredump,也要调用do_exit的
1962 /* NOTREACHED */
1963 }
1964
1965 /*
1966 * Death signals, no core dump.
1967 */
1968 do_group_exit(signr); // 上面的都不成立,进程退出
1969 /* NOTREACHED */
1970 }
1971 spin_unlock_irq(&current->sighand->siglock);
1972 return signr;
1973 }
看上面代码,coredump如果成功,调用do_exit,要flush的;如果coredump不成功,下面第1968行do_group_exit里也要调用do_exit,还是要flush的。
总结
linux身为操作系统,对进程死掉这种情况必须处理得干干净净,因为这是经常发生的事,所以进程只要退出,哪怕是被kill信号杀死,哪怕是coredump,都是要调用flush的。
所以yahoo利用linux下的这一特性,做了一个虚拟设备,进程启动时打开此设备,这之后,只要进程一死,不管是怎么死的,这个虚拟设备都会知道(因为设备可以截获flush),然后干些重要的事情......不能再说了,梅坚说了,今后不准顺便透露友公司的技术......所以就此打住,我可不想被fire。
阅读(1553) | 评论(0) | 转发(0) |