server hung task timeout-wenhq-ChinaUnix博客

Henry
首页　| 　博文目录　| 　关于我
wenhq
博客访问： 493165
博文数量： 108
博客积分： 25
博客等级：民兵
技术积分： 1134
用户组：普通用户
注册时间： 2010-03-29 19:43
文章分类
全部博文（108）
Zeppelin（1）
AWS Redshif（1）
aws（1）
Hbase（1）
ambari（1）
Hue（1）
Hive（1）
Scheduler（1）
Windows（1）
Puppet（1）
Git（1）
性能（1）
Redis（0）
Nginx（2）
Docker（1）
Hadoop（3）
Glusterfs（5）
日志管理（2）
Perl（4）
Ganglia（1）
监控（1）
MongoDB（1）
日记（2）
ubuntu（3）
ansible（3）
saltstack（6）
CI（1）
Python（13）
Mysql（1）
云计算（2）
Shell（8）
Linux（33）
Nginx（2）
未分配的博文（2）
文章存档
2016年（10）
2015年（9）
2014年（73）
2013年（16）
我的朋友
joepayne
相关博文
server hung task timeout
分类：系统运维
2013-12-09 10:21:03
						关于rcu的一些知识 http://www.ibm.com/developerworks/cn/linux/l-rcu/

					001
				
					/**
				
					002
				
					 * Detect Hung Task
				
					003
				
					 *
				
					004
				
					 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
				
					005
				
					 *
				
					006
				
					 */
				
					007
				
					008
				
					#include 
				
					009
				
					#include 
				
					010
				
					#include 
				
					011
				
					#include 
				
					012
				
					#include 
				
					013
				
					#include 
				
					014
				
					#include 
				
					015
				
					#include 
				
					016
				
					#include 
				
					017
				
					#include 
				
					018
				
					019
				
					/**
				
					020
				
					 * The number of tasks checked:
				
					021
				
					 */
				
					022
				
					unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
				
					023
				
					024
				
					/**
				
					025
				
					 * Limit number of tasks checked in a batch.
				
					026
				
					 *
				
					027
				
					 * This value controls the preemptibility of khungtaskd since preemption
				
					028
				
					 * is disabled during the critical section. It also controls the size of
				
					029
				
					 * the RCU grace period. So it needs to be upper-bound.
				
					030
				
					 */
				
					031
				
					#define HUNG_TASK_BATCHING 1024
				
					032
				
					033
				
					/**
				
					034
				
					 * Zero means infinite timeout - no checking done:
				
					035
				
					 */
				
					036
				
					unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
				
					037
				
					038
				
					unsigned long __read_mostly sysctl_hung_task_warnings = 10;
				
					039
				
					040
				
					static int __read_mostly did_panic;
				
					041
				
					042
				
					static struct task_struct *watchdog_task;
				
					043
				
					044
				
					/**
				
					045
				
					 * Should we panic (and reboot, if panic_timeout= is set) when a
				
					046
				
					 * hung task is detected:
				
					047
				
					 */
				
					048
				
					unsigned int __read_mostly sysctl_hung_task_panic =
				
					049
				
					                CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
				
					050
				
					051
				
					static int __init hung_task_panic_setup(char *str)
				
					052
				
					{
				
					053
				
					    sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
				
					054
				
					055
				
					    return 1;
				
					056
				
					}
				
					057
				
					__setup("hung_task_panic=", hung_task_panic_setup);
				
					058
				
					059
				
					static int
				
					060
				
					hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
				
					061
				
					{
				
					062
				
					    did_panic = 1;
				
					063
				
					064
				
					    return NOTIFY_DONE;
				
					065
				
					}
				
					066
				
					067
				
					static struct notifier_block panic_block = {
				
					068
				
					    .notifier_call = hung_task_panic,
				
					069
				
					};
				
					070
				
					071
				
					static void check_hung_task(struct task_struct *t, unsigned long timeout)
				
					072
				
					{
				
					073
				
					    unsigned long switch_count = t->nvcsw + t->nivcsw;
				
					074
				
					075
				
					    /**
				
					076
				
					     * Ensure the task is not frozen.
				
					077
				
					     * Also, when a freshly created task is scheduled once, changes
				
					078
				
					     * its state to TASK_UNINTERRUPTIBLE without having ever been
				
					079
				
					     * switched out once, it musn't be checked.
				
					080
				
					     */
				
					081
				
					    if (unlikely(t->flags & PF_FROZEN || !switch_count))
				
					082
				
					        return;
				
					083
				
					084
				
					    if (switch_count != t->last_switch_count) {
				
					085
				
					        t->last_switch_count = switch_count;
				
					086
				
					        return;
				
					087
				
					    }
				
					088
				
					    if (!sysctl_hung_task_warnings)
				
					089
				
					        return;
				
					090
				
					    sysctl_hung_task_warnings--;
				
					091
				
					092
				
					    /**
				
					093
				
					     * Ok, the task did not get scheduled for more than 2 minutes,
				
					094
				
					     * complain:
				
					095
				
					     */
				
					096
				
					    printk(KERN_ERR "INFO: task %s:%d blocked for more than "
				
					097
				
					            "%ld seconds.\n", t->comm, t->pid, timeout);
				
					098
				
					    printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
				
					099
				
					            " disables this message.\n");
				
					100
				
					    sched_show_task(t);
				
					101
				
					    __debug_show_held_locks(t);
				
					102
				
					103
				
					    touch_nmi_watchdog();
				
					104
				
					105
				
					    if (sysctl_hung_task_panic)
				
					106
				
					        panic("hung_task: blocked tasks");
				
					107
				
					}
				
					108
				
					109
				
					/**
				
					110
				
					 * To avoid extending the RCU grace period for an unbounded amount of time,
				
					111
				
					 * periodically exit the critical section and enter a new one.
				
					112
				
					 *
				
					113
				
					 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
				
					114
				
					 * exit the grace period. For classic RCU, a reschedule is required.
				
					115
				
					 */
				
					116
				
					static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
				
					117
				
					{
				
					118
				
					    get_task_struct(g);
				
					119
				
					    get_task_struct(t);
				
					120
				
					    rcu_read_unlock();
				
					121
				
					    cond_resched();
				
					122
				
					    rcu_read_lock();
				
					123
				
					    put_task_struct(t);
				
					124
				
					    put_task_struct(g);
				
					125
				
					}
				
					126
				
					127
				
					/**
				
					128
				
					 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
				
					129
				
					 * a really long time (120 seconds). If that happens, print out
				
					130
				
					 * a warning.
				
					131
				
					 */
				
					132
				
					static void check_hung_uninterruptible_tasks(unsigned long timeout)
				
					133
				
					{
				
					134
				
					    int max_count = sysctl_hung_task_check_count;
				
					135
				
					    int batch_count = HUNG_TASK_BATCHING;
				
					136
				
					    struct task_struct *g, *t;
				
					137
				
					138
				
					    /**
				
					139
				
					     * If the system crashed already then all bets are off,
				
					140
				
					     * do not report extra hung tasks:
				
					141
				
					     */
				
					142
				
					    if (test_taint(TAINT_DIE) || did_panic)
				
					143
				
					        return;
				
					144
				
					145
				
					    rcu_read_lock();
				
					146
				
					    do_each_thread(g, t) {
				
					147
				
					        if (!max_count--)
				
					148
				
					            goto unlock;
				
					149
				
					        if (!--batch_count) {
				
					150
				
					            batch_count = HUNG_TASK_BATCHING;
				
					151
				
					            rcu_lock_break(g, t);
				
					152
				
					            /** Exit if t or g was unhashed during refresh. */
				
					153
				
					            if (t->state == TASK_DEAD || g->state == TASK_DEAD)
				
					154
				
					                goto unlock;
				
					155
				
					        }
				
					156
				
					        /** use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
				
					157
				
					        if (t->state == TASK_UNINTERRUPTIBLE)
				
					158
				
					            check_hung_task(t, timeout);
				
					159
				
					    } while_each_thread(g, t);
				
					160
				
					 unlock:
				
					161
				
					    rcu_read_unlock();
				
					162
				
					}
				
					163
				
					164
				
					static unsigned long timeout_jiffies(unsigned long timeout)
				
					165
				
					{
				
					166
				
					    /** timeout of 0 will disable the watchdog */
				
					167
				
					    return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
				
					168
				
					}
				
					169
				
					170
				
					/**
				
					171
				
					 * Process updating of timeout sysctl
				
					172
				
					 */
				
					173
				
					int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
				
					174
				
					                  void __user *buffer,
				
					175
				
					                  size_t *lenp, loff_t *ppos)
				
					176
				
					{
				
					177
				
					    int ret;
				
					178
				
					179
				
					    ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
				
					180
				
					181
				
					    if (ret || !write)
				
					182
				
					        goto out;
				
					183
				
					184
				
					    wake_up_process(watchdog_task);
				
					185
				
					186
				
					 out:
				
					187
				
					    return ret;
				
					188
				
					}
				
					189
				
					190
				
					/**
				
					191
				
					 * kthread which checks for tasks stuck in D state
				
					192
				
					 */
				
					193
				
					static int watchdog(void *dummy)
				
					194
				
					{
				
					195
				
					    set_user_nice(current, 0);
				
					196
				
					197
				
					    for ( ; ; ) {
				
					198
				
					        unsigned long timeout = sysctl_hung_task_timeout_secs;
				
					199
				
					200
				
					        while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
				
					201
				
					            timeout = sysctl_hung_task_timeout_secs;
				
					202
				
					203
				
					        check_hung_uninterruptible_tasks(timeout);
				
					204
				
					    }
				
					205
				
					206
				
					    return 0;
				
					207
				
					}
				
					208
				
					209
				
					static int __init hung_task_init(void)
				
					210
				
					{
				
					211
				
					    atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
				
					212
				
					    watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
				
					213
				
					214
				
					    return 0;
				
					215
				
					}
				
					216
				
					217
				
					module_init(hung_task_init);
				
	导致服务器挂起，sysctl -a|grep  hung  默认是否应该设置为0。这样就避免挂起。
阅读(1832) | 评论(0) | 转发(0) |
上一篇：linux singal 含义(部分)
下一篇：centos monit
给主人留下些什么吧！~~
感谢所有关心和支持过ChinaUnix的朋友们
16024965号-6