分类: LINUX
2008-09-22 23:05:40
|
Alin Fang (Fang Yunlin)
MSN:
G Talk:
Blog: http://www.alinblog.cn/
22 Sep, 2008
第一次修改
GNU
本人实验笔记,非权威文档。如有错误请告知作者。十分感谢!
netdump需要两台机器配合。
如果client端死机,则会把内存里面的信息dump到server。
client:
OS: Red Hat Enterprise Linux 4 update 7
server:
OS: Red Hat Enterprise Linux 4 update 7
client网络环境:
[root@station1 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:22:61:B5
inet addr:192.168.0.1 Bcast:192.168.255.255 Mask:255.255.0.0
inet6 addr: fe80::20c:29ff:fe22:61b5/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:4 errors:0 dropped:0 overruns:0 frame:0
TX packets:9 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:240 (240.0 b) TX bytes:546 (546.0 b)
Interrupt:185 Base address:0x2000
eth1 Link encap:Ethernet HWaddr 00:0C:29:22:61:BF
inet addr:10.66.0.194 Bcast:10.66.1.255 Mask:255.255.254.0
inet6 addr: fe80::20c:29ff:fe22:61bf/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:68 errors:0 dropped:0 overruns:0 frame:0
TX packets:57 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:9962 (9.7 KiB) TX bytes:7759 (7.5 KiB)
Interrupt:177 Base address:0x2080
lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:16436 Metric:1
RX packets:8 errors:0 dropped:0 overruns:0 frame:0
TX packets:8 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:560 (560.0 b) TX bytes:560 (560.0 b)
[root@station1 ~]# route
Kernel IP routing table
Destination Gateway Genmask Flags Metric Ref Use Iface
10.66.0.0 * 255.255.254.0 U 0 0 0 eth1
169.254.0.0 * 255.255.0.0 U 0 0 0 eth1
192.168.0.0 * 255.255.0.0 U 0 0 0 eth0
default 10.66.1.254 0.0.0.0 UG 0 0 0 eth1
[root@station1 ~]#
server网络环境:
[root@station2 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:32:64:A0
inet addr:192.168.0.2 Bcast:192.168.255.255 Mask:255.255.0.0
inet6 addr: fe80::20c:29ff:fe32:64a0/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:0 errors:0 dropped:0 overruns:0 frame:0
TX packets:9 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:0 (0.0 b) TX bytes:546 (546.0 b)
Interrupt:185 Base address:0x2000
eth1 Link encap:Ethernet HWaddr 00:0C:29:32:64:AA
inet addr:10.66.0.114 Bcast:10.66.1.255 Mask:255.255.254.0
inet6 addr: fe80::20c:29ff:fe32:64aa/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:126 errors:0 dropped:0 overruns:0 frame:0
TX packets:55 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:16122 (15.7 KiB) TX bytes:7651 (7.4 KiB)
Interrupt:177 Base address:0x2080
lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:16436 Metric:1
RX packets:8 errors:0 dropped:0 overruns:0 frame:0
TX packets:8 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:560 (560.0 b) TX bytes:560 (560.0 b)
[root@station2 ~]# route
Kernel IP routing table
Destination Gateway Genmask Flags Metric Ref Use Iface
10.66.0.0 * 255.255.254.0 U 0 0 0 eth1
169.254.0.0 * 255.255.0.0 U 0 0 0 eth1
192.168.0.0 * 255.255.0.0 U 0 0 0 eth0
default 10.66.1.254 0.0.0.0 UG 0 0 0 eth1
[root@station2 ~]#
[root@station2 ~]# rpm -q netdump-server
package netdump-server is not installed
[root@station2 ~]# mount /dev/cdrom /media/
mount: block device /dev/cdrom is write-protected, mounting read-only
[root@station2 ~]# cd /media/RedHat/RPMS/
[root@station2 RPMS]# rpm -ivh netdump-server-0.7.16-14.i386.rpm
warning: netdump-server-0.7.16-14.i386.rpm: V3 DSA signature: NOKEY, key ID db42a60e
Preparing... ########################################### [100%]
1:netdump-server ########################################### [100%]
[root@station2 RPMS]# service netdump-server
Usage: netdump-server {start|stop|status|restart|condrestart}
[root@station2 RPMS]# service netdump-server start
Starting netdump server: [ OK ]
[root@station2 RPMS]# chkconfig --level 35 netdump-server on
[root@station2 RPMS]# passwd netdump
Changing password for user netdump.
New UNIX password:
BAD PASSWORD: it is based on a dictionary word
Retype new UNIX password:
passwd: all authentication tokens updated successfully.
[root@station2 RPMS]#
安装netdump-server时会在系统中添加netdump这个用户,供netdump传输数据时使用。但安装过程并不会为这个用户设置密码,所以该用户默认处于锁定状态。因此务必记得给netdump这个用户设置密码!
[root@station1 ~]# rpm -q netdump
netdump-0.7.16-14
[root@station1 ~]# vi /etc/sysconfig/netdump
指定netdump-server的IP地址
NETDUMPADDR=192.168.0.2
[root@station1 ~]# service netdump start
netdump@10.66.0.114's password:
initializing netdump [ OK ]
initializing netconsole [ OK ]
Message from syslogd@station1 at Mon Sep 22 17:47:38 2008 ...
station1 kernel: [...network console startup...]
[root@station1 ~]#
[root@station1 ~]# echo 1 > /proc/sys/kernel/sysrq
[root@station1 ~]# echo c > /proc/sysrq-trigger
此时内核crash,netdump随即被触发。
当dump完数据,机器自动重启。
在server上检验数据
[root@station2 ~]# cd /var/crash/
[root@station2 crash]# ll
total 16
drwx------ 2 netdump netdump 4096 Sep 22 18:05 192.168.0.1-2008-09-22-18:04
drwx------ 2 netdump netdump 4096 Sep 22 17:48 magic
drwxr-xr-x 2 netdump netdump 4096 Aug 4 2007 scripts
[root@station2 crash]# cd 192.168.0.1-2008-09-22-18\:04/
[root@station2 192.168.0.1-2008-09-22-18:04]# ll
total 81656
-rw------- 1 netdump netdump 30592 Sep 22 18:05 log
-rw------- 1 netdump netdump 268439552 Sep 22 18:05 vmcore
[root@station2 192.168.0.1-2008-09-22-18:04]# ll -h
total 80M
-rw------- 1 netdump netdump 30K Sep 22 18:05 log
-rw------- 1 netdump netdump 257M Sep 22 18:05 vmcore
[root@station2 192.168.0.1-2008-09-22-18:04]#
vmcore就是发生crash的时候内存里的数据。
diskdump在Red Hat Enterprise Linux 4 update 4之前需要一块单独的硬盘。
在Red Hat Enterprise Linux 4 update 4以及之后,只需要一块单独的磁盘分区。
Red Hat Enterprise Linux 4 update 7
原本打算采用Red Hat Enterprise Linux 5 update 2,但是发现diskdump在RHEL5以及更高版本上已被kdump所替代。
试验步骤
先划分一块分区给diskdump。
[root@dhcp-0-084 ~]# fdisk /dev/sdb
Device contains neither a valid DOS partition table, nor Sun, SGI or OSF disklabel
Building a new DOS disklabel. Changes will remain in memory only,
until you decide to write them. After that, of course, the previous
content won't be recoverable.
Warning: invalid flag 0x0000 of partition table 4 will be corrected by w(rite)
Command (m for help): p
Disk /dev/sdb: 4294 MB, 4294967296 bytes
255 heads, 63 sectors/track, 522 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes
Device Boot Start End Blocks Id System
Command (m for help): n
Command action
e extended
p primary partition (1-4)
p
Partition number (1-4): 1
First cylinder (1-522, default 1):
Using default value 1
Last cylinder or +size or +sizeM or +sizeK (1-522, default 522):
Using default value 522
Command (m for help): w
The partition table has been altered!
Calling ioctl() to re-read partition table.
Syncing disks.
[root@dhcp-0-084 ~]# partprobe
Warning: Unable to open /dev/hdc read-write (Read-only file system). /dev/hdc has been opened read-only.
No changes made to disk, exiting partprobe.
[root@dhcp-0-084 ~]#
我们用/dev/sdb1作为diskdump的数据临时存放点。
编辑diskdump配置文件。
[root@dhcp-0-084 RPMS]# rpm -qa | grep diskdump
diskdumputils-1.4.1-5
[root@dhcp-0-084 RPMS]# vi /etc/sysconfig/diskdump
DEVICE=/dev/sdb1
初始化并启动diskdump服务。
[root@dhcp-0-084 ~]# service diskdump initialformat
Formatting dump device:
Do you want to format /dev/sdb1 (yes/NO)? yes
/dev/sdb1: [100.0%]
[root@dhcp-0-084 ~]# service diskdump restart
Starting diskdump: [ OK ]
[root@dhcp-0-084 ~]# chkconfig --level 35 diskdump on
[root@dhcp-0-084 ~]#
然后模拟死机。
[root@dhcp-0-084 RPMS]# echo 1 > /proc/sys/kernel/sysrq
[root@dhcp-0-084 RPMS]# echo c > /proc/sysrq-trigger
此时diskdump开始工作。把内存中的数据dump到/dev/sdb1中。
diskdump之后必须手动重启机器。
重启机器后,diskdump会把/dev/sdb1里面的数据拷到/var/crash文件夹下。
检查diskdump数据。
[root@dhcp-0-084 ~]# cd /var/crash/
[root@dhcp-0-084 crash]# ll
total 12
drwx------ 2 root root 4096 Sep 22 19:52 127.0.0.1-2008-09-22-19:49
drwxr-xr-x 2 netdump netdump 4096 Mar 25 23:33 scripts
[root@dhcp-0-084 crash]# cd 127.0.0.1-2008-09-22-19\:49/
[root@dhcp-0-084 127.0.0.1-2008-09-22-19:49]# ll -h
total 86M
-rw------- 1 root root 513M Sep 22 19:52 vmcore
[root@dhcp-0-084 127.0.0.1-2008-09-22-19:49]#
client OS: Red Hat Enterprise Linux 5 update 2
client IP: 10.66.0.157
server OS: Red Hat Enterprise Linux 4 update 7
server IP: 10.66.0.84
kdump由kexec-tools-1.102pre-21.el5这个包提供,一般安装系统时已经默认装上。
kdump支持多种方式存储内存数据,包括裸设备、文件系统、NFS、SSH,并且能设定在dump前和dump后运行脚本以及执行其他动作。正所谓很好很强大。
kdump的dump机制是:预先为crashkernel预留一段内存并把它加载进去,在内核crash的时候,激活这个crashkernel,再由这个crashkernel载入的小型系统把处于crash状态的内核的内存数据dump出来。
这次我会配置kdump把内存数据scp到备份服务器上。
在client上:
[root@dhcp-0-157 ~]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 5.2 (Tikanga)
[root@dhcp-0-157 ~]# cd /misc/cd/Server
[root@dhcp-0-157 Server]# rpm -ivh busybox-1.2.0-4.el5.i386.rpm
warning: busybox-1.2.0-4.el5.i386.rpm: Header V3 DSA signature: NOKEY, key ID 37017186
Preparing... ########################################### [100%]
1:busybox ########################################### [100%]
[root@dhcp-0-157 Server]# rpm -ivh kexec-tools-1.102pre-21.el5.i386.rpm
warning: kexec-tools-1.102pre-21.el5.i386.rpm: Header V3 DSA signature: NOKEY, key ID 37017186
Preparing... ########################################### [100%]
1:kexec-tools ########################################### [100%]
[root@dhcp-0-157 Server]#
[root@dhcp-0-157 ~]# vim /etc/kdump.conf
net root@10.66.0.84
对kernel开启kdump支持
[root@dhcp-0-157 ~]# vim /boot/grub/grub.conf
在kernel选项上添加crashkernel参数。
参数格式是:
crashkernel=nn[KMG]@ss[KMG]
nn表示要为crashkernel预留多少内存
ss表示为crashkernel预留内存的起始位置
default=0
timeout=5
splashimage=(hd0,0)/grub/splash.xpm.gz
hiddenmenu
title Red Hat Enterprise Linux Server (2.6.18-92.el5)
root (hd0,0)
kernel /vmlinuz-2.6.18-92.el5 ro root=LABEL=/ crashkernel=256M@16M
initrd /initrd-2.6.18-92.el5.img
重启电脑使新参数生效。
[root@dhcp-0-157 ~]# service kdump
Usage: /etc/init.d/kdump {start|stop|status|restart|propagate}
[root@dhcp-0-157 ~]# service kdump propagate
Generating new ssh keys... done.
The authenticity of host '10.66.0.84 (10.66.0.84)' can't be established.
RSA key fingerprint is 31:c2:d8:b6:eb:2e:03:64:cd:ba:56:e9:49:6e:5d:6c.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added '10.66.0.84' (RSA) to the list of known hosts.
root@10.66.0.84's password:
/root/.ssh/kdump_id_rsa.pub has been added to ~root/.ssh/authorized_keys2 on 10.66.0.84
[root@dhcp-0-157 ~]# service kdump restart
Stopping kdump:[ OK ]
No kdump initial ramdisk found.[WARNING]
Rebuilding /boot/initrd-2.6.18-92.el5kdump.img
Starting kdump:[ OK ]
[root@dhcp-0-157 ~]# chkconfig --level 35 kdump on
[root@dhcp-0-157 ~]#
[root@dhcp-0-157 ~]# echo 1 > /proc/sys/kernel/sysrq
[root@dhcp-0-157 ~]# echo c > /proc/sysrq-trigger
在server的/var/crash下可以看到由client转储过来的内核数据。
另外,kdump在完成内核转储后会自动重启。