全部博文(389)
分类: LINUX
2013-06-10 00:06:11
一次linux的groupadd hang住处理
某日从同事那里要了一台linux服务器,准备来安装timesten。在执行groupadd ttadmin时,命令一直没有响应,卡住了
使用top命令查看系统资源,发现系统处于一个很空闲的状态,没有任何异常:
Cpu(s): 0.5%us, 0.0%sy, 0.0%ni, 99.5%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
top -p 7621 命令可以看到当前的进程没有任何运行,状态一直处于S
top - 17:14:09 up 1 day, 22:03, 7 users, load average: 0.06, 0.25, 0.34
Tasks: 1 total, 0 running, 1 sleeping, 0 stopped, 0 zombie
Cpu(s): 0.0%us, 0.0%sy, 0.0%ni,100.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 2395192k total, 2217776k used, 177416k free, 202580k buffers
Swap: 4456440k total, 0k used, 4456440k free, 1263728k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
7621 root 17 0 10160 796 656 S 0.0 0.0 0:00.00 groupadd
于是使用了pstack,发现最后调用的函数是一个sleep函数。进程为什么会进入睡眠呢?调用栈中还有libnss_ldap相关的函数,但是当时我没太在意,以为是要进行用户验证之类的。
[root@IP119 ~]# pstack 7621
#1 0x00000038b3299fc4 in sleep () from /lib64/libc.so.6
#2 0x00002adbc79039e8 in ?? () from /lib64/libnss_ldap.so.2
#3 0x00002adbc7904494 in _nss_ldap_search_s () from /lib64/libnss_ldap.so.2
#4 0x00002adbc790505b in _nss_ldap_getbyname () from /lib64/libnss_ldap.so.2
#5 0x00002adbc79068cb in _nss_ldap_getgrnam_r () from /lib64/libnss_ldap.so.2
#6 0x00000038b3298105 in () from /lib64/libc.so.6
#7 0x00000038b3297760 in getgrnam () from /lib64/libc.so.6
#8 0x000000000040208d in fflush ()
#9 0x00000038b321d994 in __libc_start_main () from /lib64/libc.so.6
#10 0x0000000000401d19 in fflush ()
#11 0x00007fff05e305c8 in ?? ()
#12 0x0000000000000000 in ?? ()
再次使用strace对这个命令进行跟踪:
[root@IP119 7559]# strace groupadd ttadmin
munmap(0x2b6ab75a9000, 4096) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
fcntl(5, F_SETFD, FD_CLOEXEC) = 0
setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
setsockopt(5, SOL_TCP, TCP_NODELAY, [1], 4) = 0
fcntl(5, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(389), sin_addr=inet_addr("172.28.10.117")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=5, events=POLLOUT|POLLERR|POLLHUP}], 1, 120000) = 1 ([{fd=5, revents=POLLERR|POLLHUP}])
getpeername(5, 0x7fff28b3b7a0, [4294967424]) = -1 ENOTCONN (Transport endpoint is not connected)
read(5, 0x7fff28b3b798, 1) = -1 ECONNREFUSED (Connection refused)
shutdown(5, 2 /* send and receive */) = -1 ENOTCONN (Transport endpoint is not connected)
close(5) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
sendto(4, "<86>Jun 9 12:38:50 groupadd[777"..., 97, MSG_NOSIGNAL, NULL, 0) = 97
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({8, 0}, {8, 0}) = 0
stat("/etc/ldap.conf", {st_mode=S_IFREG|0644, st_size=9168, ...}) = 0
geteuid() = 0
open("/etc/hosts", O_RDONLY) = 5
fcntl(5, F_GETFD) = 0
fcntl(5, F_SETFD, FD_CLOEXEC) = 0
fstat(5, {st_mode=S_IFREG|0644, st_size=258, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b6ab75a9000
read(5, "# Do not remove the following li"..., 4096) = 258
read(5, "", 4096) = 0
close(5) = 0
munmap(0x2b6ab75a9000, 4096) = 0
open("/etc/hosts", O_RDONLY) = 5
fcntl(5, F_GETFD) = 0
fcntl(5, F_SETFD, FD_CLOEXEC) = 0
fstat(5, {st_mode=S_IFREG|0644, st_size=258, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b6ab75a9000
read(5, "# Do not remove the following li"..., 4096) = 258
close(5) = 0
munmap(0x2b6ab75a9000, 4096) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
fcntl(5, F_SETFD, FD_CLOEXEC) = 0
setsockopt(5, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
setsockopt(5, SOL_TCP, TCP_NODELAY, [1], 4) = 0
fcntl(5, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(389), sin_addr=inet_addr("172.28.10.117")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=5, events=POLLOUT|POLLERR|POLLHUP}], 1, 120000) = 1 ([{fd=5, revents=POLLERR|POLLHUP}])
getpeername(5, 0x7fff28b3b7a0, [4294967424]) = -1 ENOTCONN (Transport endpoint is not connected)
read(5, 0x7fff28b3b798, 1) = -1 ECONNREFUSED (Connection refused)
shutdown(5, 2 /* send and receive */) = -1 ENOTCONN (Transport endpoint is not connected)
close(5) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=118, ...}) = 0
sendto(4, "<86>Jun 9 12:38:58 groupadd[777"..., 98, MSG_NOSIGNAL, NULL, 0) = 98
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({16, 0}, 0x7fff28b3ba00) = ? ERESTART_RESTARTBLOCK (To be restarted)
发现这个进程会一直去连接172.28.10.117的389端口(LDAP服务端口),但是连接被拒绝(ECONNREFUSED),于是进程进入睡眠后再重试,并且睡眠时间逐次加长(从8秒到16秒)。进一步确认172.28.10.117原来是一台ldap服务器,不过已经被移除了。
应该是groupadd、useradd、groupdel之类的命令在执行时,如果配置了ldap,需要向ldap进行一些确认。ldap服务器没有响应,从而使这些命令进入睡眠状态。
最后移除了这台服务器上的ldap配置,命令执行就正常了。