分类: LINUX
2010-07-22 18:32:10
二.nagios配置
1.在服务器端安装nrpe(此处使用2.12版本)
#tar zxvf nrpe-2.12.tar.gz
#cd nrpe-2.12
#./configure (因为之前安装了nagios-plugins,所以nrpe默认安装在/usr/local/nagios/下,也就是与nagios-plugins在同一个安装目录下)
#make all
#make install-plugin
#make install-daemon
#make install-daemon-config
# ls /usr/local/nagios/libexec/check_nrpe
/usr/local/nagios/libexec/check_nrpe
此文件出现,表明安装成功
# ll /usr/local/nagios/
total 24
drwxrwxr-x 2 nagios nagios 4096 Jul 21 19:09 bin
drwxrwxr-x 3 nagios nagios 4096 Jul 22 13:35 etc
drwxrwxr-x 2 nagios nagios 4096 Jul 21 19:09 libexec
drwxrwxr-x 2 nagios nagios 4096 Jul 21 18:57 sbin
drwxrwxr-x 10 nagios nagios 4096 Jul 21 19:03 share
drwxrwxr-x 5 nagios nagios 4096 Jul 22 14:25 var
注意:此时nagios目录下的所有文件与子目录的所有者与所属组都为nagios,只有一个例外:/usr/local/nagios/etc/htpasswd.users的所有者与所属组为root root。以后再添加的文件也同样应为nagios nagios,这里如果出现差错,后面可能会出权限问题。
2.配置nagios主配置文件nagios.cfg
# cat nagios.cfg 只写出改动的部分,下同
cfg_file=/usr/local/nagios/etc/objects/commands.cfg
cfg_file=/usr/local/nagios/etc/objects/contacts.cfg
cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg
cfg_file=/usr/local/nagios/etc/objects/templates.cfg
新添加下面4句,指向子文件所在位置
cfg_file=/usr/local/nagios/etc/hosts.cfg
cfg_file=/usr/local/nagios/etc/hostgroups.cfg
cfg_file=/usr/local/nagios/etc/contactgroups.cfg
cfg_file=/usr/local/nagios/etc/services.cfg
# Definitions for monitoring the local (Linux) host
#cfg_file=/usr/local/nagios/etc/objects/localhost.cfg #注释掉,因为有了hosts.cfg文件
command_check_interval=10s
#command_check_interval=-1 #原来为-1,改成10s
3.根据上一步新添加的4句,创建文件hosts.cfg hostgroups.cfg contactgroups.cfg services.cfg(文件名必须与nagios.cfg中cfg_file指定的完全一致)
4.配置hosts.cfg hostgroups.cfg contactgroups.cfg
# cat hosts.cfg
define host {
host_name nagios-server #与hostgroups.cfg中members定义的保持一致
alias nagios server
address 192.168.0.13 #被监控主机IP
contact_groups sagroup #监控用户所在的组名,在contactgroups.cfg定义
check_command check-host-alive #此为一个命令,在objects/commands.cfg中有定义,必须有定义
max_check_attempts 5 #检测次数,一般为3-5次
notification_interval 10 #故障未恢复时重复发送通知的时间间隔,默认单位为分钟,根据自己情况定
notification_period 24x7 #代表全天候(7x24小时)都可以发送通知,不能为*,只能为x,下同
notification_options d,u,r #触发通知的状态,d-down,u-unreachable,r-recovery
}
----------------------------------------------------
# cat hostgroups.cfg 定义组与组成员
define hostgroup {
hostgroup_name sa-servers
alias sa servers
members nagios-server #(如果有多个主机,可以以“,”分隔,不能有空格)
}
----------------------------------------------------
# cat contactgroups.cfg
define contactgroup {
contactgroup_name sagroup
alias system administrator group
members nagiosadmin
}
--------------------
5.配置cgi.cfg
# cat cgi.cfg
use_authentication=0 #改成0表示不对用户进行cgi验证
authorized_for_system_information=nagiosadmin #因为当时创建的管理用户就是nagiosadmin,所以此处不用修改;如果创建的用户为其他名称,则要修改;如果创建了多个用户,可以用“,”分隔。
authorized_for_configuration_information=nagiosadmin
authorized_for_system_commands=nagiosadmin # * 此处即使是其他用户,也不能改动。*
authorized_for_all_services=nagiosadmin
authorized_for_all_hosts=nagiosadmin
authorized_for_all_service_commands=nagiosadmin
authorized_for_all_host_commands=nagiosadmin
6.配置nrpe.cfg
# cat nrpe.cfg | sed -n '/^[^#]/p'
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666 #端口号,可以改动
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,192.168.0.13 #此处是可以连接管理此主机的服务器,也就是监控服务器的IP
dont_blame_nrpe=0
debug=0
command_timeout=60
connection_timeout=300
#下面是定义的命令
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10 #登录用户数,超过5个warning,超过10个Critical(严重)
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20 #负载情况,三个数分别对应最近1分钟、5分钟、15分钟内的平均负载
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z #僵尸进程数,超过5个warning,超过10个Critical
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200 #进程总数,超过150个warning,超过200个Critical
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10% #交换分区剩余空间,低于20%时warning,低于10%时Critical
command[check_disk]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/sda3 #磁盘分区剩余空间,低于20%时warning,低于10%时Critical
还可以自己定义,通过写脚本来完成,后面再来补充。
7.配置objects/contacts.cfg
# cat objects/contacts.cfg
define contact{
contact_name nagiosadmin
alias system administrator
service_notification_period 24x7
host_notification_period 24x7
service_notification_options w,u,c,r #代表Warning,Unknown,Critical,recovery
host_notification_options d,u,r
service_notification_commands notify-service-by-fetion,notify-service-by-sms #指明报警方式
host_notification_commands notify-host-by-fetion,notify-host-by-sms #同上
email **********@139.com
pager 152******13
}
8.配置 objects/commands.cfg
# cat objects/commands.cfg (一定要定义的列出,其他的不必要变动)
# 'check-host-alive' define command
define command{
command_name check-host-alive
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
}
# 'check_nrpe' define command 这个是要自己定义的,很重要,会影响到services.cfg中的配置
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ # $ARG1$表示check_nrpe后面的命令,如:check_disk
}
# 'notify-host-by-fetion' command definition 飞信报警配置
define command{
command_name notify-host-by-fetion
command_line /usr/local/fetion/fetion --mobile=152******** --pwd=******** --to $CONTACTPAGER$ --msg-utf8="$HOSTNAME$ is $HOSTSTATE$" --debug
}
# 'notify-service-by-fetion' command definition
define command{
command_name notify-service-by-fetion
command_line /usr/local/fetion/fetion --mobile=152******** --pwd=******** --to $CONTACTPAGER$ --msg-utf8="$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ IS $SERVICESTATE$" --debug
}
# 'notify-host-by-sms' command definition 邮件报警配置
define command {
command_name notify-host-by-sms
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/local/sendEmail/sendEmail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
}
# 'notify-service-by-sms' command definition
define command {
command_name notify-service-by-sms
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /usr/local/sendEmail/sendEmail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
9.配置services.cfg
#cat services.cfg
###nagios-server:services.cfg###
define service {
host_name nagios-server #主机名一定要与hosts.cfg文件中的定义保持一致
service_description check-host-alive
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
check_command check-host-alive #命令为objects/commands.cfg中已经定义的
}
define service {
host_name nagios-server
service_description check_tcp 80
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
check_command check_tcp!80 #感叹号后面为参数
}
define service {
host_name nagios-server
service_description check_local_disk
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
#check_command check_local_disk!20%!10%!/
check_command check_nrpe!check_disk
}
define service {
host_name nagios-server
service_description check_load
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
check_command check_nrpe!check_load
}
define service {
host_name nagios-server
service_description check_total_procs
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
check_command check_nrpe!check_total_procs
}
define service {
host_name nagios-server
service_description check_users
check_period 24x7
max_check_attempts 4
normal_check_interval 3
retry_check_interval 2
contact_groups sagroup
notification_interval 10
notification_period 24x7
notification_options w,u,c,r
check_command check_nrpe!check_users
}
此处定义监控6个服务,如果要监控其他主机的服务,也要在这里定义,下面会提到。
10.此时配置完成了一大步,以后再配置也是在这个基础上,会很容易了。
下面就要启动nrpe,重启nagios来检测配置是否成功!
#/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
Checking for circular paths between hosts...
Checking for circular host and service dependencies...
Checking global event handlers...
Checking obsessive compulsive processor commands...
Checking misc settings...
Total Warnings: 0
Total Errors: 0
出现以上输出(Total Warnings与Total Errors均为0),表明配置文件没有错误,可以启动nagios
#service nagios restart 启动成功
# /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
# tail -f /var/log/messages
Jul 22 16:25:16 localhost nrpe[14911]: Starting up daemon
Jul 22 16:25:16 localhost nrpe[14911]: Listening for connections on port 5666
Jul 22 16:25:16 localhost nrpe[14911]: Allowing connections from: 127.0.0.1,192.168.0.13
日志信息出现如上,表明启动成功,测试一下
# /usr/local/nagios/libexec/check_nrpe -H 192.168.0.13
NRPE v2.12 会显示nrpe版本号
# /usr/local/nagios/libexec/check_nrpe -H 192.168.0.13 -c check_disk
DISK OK - free space: / 242377 MB (87% inode=99%);| /=34099MB;233219;262371;0;291524