目标:监控 WebLogic Server 的服务状态、线程繁忙程度、Java 内存使用率以及数据库连接池的状态。
主配置文件:
--mon81.cfg--
#系统缩写 NodeName 服务名 ip地址 端口 服务时间 用户/密码 系统名称 空闲线程个数报警 内存报警比率
Monitor Moniter appServer1 192.168.1.168 8001 0:00-23:59 weblogic/weblogic 服务监控 0 95
生成的py配置文件头部分
--base81.tmp--
# Header that config-81.sh writes at the top of every generated WLST script.
# NOTE(review): Date and SimpleDateFormat are not referenced by the visible
# tail script (base81.py) - confirm before removing them from the generator.
from java.util import Date
from java.text import SimpleDateFormat
import time
# Credentials that monall() passes to connect().
username='weblogic'
password='weblogic'
# Server name -> t3 URL; monall() connects to every entry in turn.
urldict={"appServer1":"t3://192.168.1.168:8001"}
生成的py配置文件尾部分
--base81.py--
# Timestamp appended to every output record; the driver loop at the end of
# this script refreshes it before each monitoring pass.
curTime = ''
# Number of monitoring passes the driver loop performs.
TotalCount = 1
# Pause between passes, in seconds.
sleeptime = 3
def _append_text(path, text):
    """Append text to the file at path, closing it even if the write fails."""
    out = open(path, "a+")
    try:
        out.write(text)
    finally:
        out.close()


def monall():
    """Collect one sample from every server listed in urldict.

    For each server: connect, then append the execute-queue, JDBC-pool and
    JVM-heap records to eq.txt, db.txt and mem.txt in the current directory,
    then disconnect.  Each collector is isolated in its own try block so a
    failure in one does not prevent the others from being recorded.
    """
    for sname, surl in urldict.items():
        # Bare except is deliberate: the WLST calls can raise Java exceptions
        # as well as Python ones.
        try:
            connect(username, password, surl)
        except:
            dumpStack()
            continue
        try:
            _append_text("eq.txt", getEQ(sname))
        except:
            dumpStack()
        try:
            _append_text("db.txt", getDBPool())
        except:
            dumpStack()
        try:
            _append_text("mem.txt", mon_jvm_heap())
        except:
            dumpStack()
        # A failing disconnect must not stop the remaining servers.
        try:
            disconnect()
        except:
            dumpStack()
def mon_jvm_heap():
    """Return one tab-separated JVM heap record: free MB, total MB, used MB, curTime.

    Returns an empty string when no JVMRuntime MBean is found (the original
    raised NameError on an unbound variable in that case).
    NOTE(review): the source indentation was lost; this is reconstructed so
    that the record of the last JVMRuntime MBean is returned, giving the
    single line the monitoring shell script expects - confirm.
    """
    val = ''
    for jvmRT in home.getMBeansByType('JVMRuntime'):
        # 1048576 = bytes per MB.
        freejvm = jvmRT.getAttribute("HeapFreeCurrent") / 1048576
        totaljvm = jvmRT.getAttribute("HeapSizeCurrent") / 1048576
        usedjvm = totaljvm - freejvm
        val = str(freejvm) + '\t' + str(totaljvm) + '\t' + str(usedjvm) + '\t' + curTime + '\n'
    return val
def getEQ(sname):
    """Return the default execute queue record for server sname.

    The record is "total threads<TAB>idle threads<TAB>location<TAB>curTime\n"
    for the queue named 'weblogic.kernel.Default'.  If that queue is missing
    or any MBean call fails, the sentinel record "-1<TAB>-1<TAB>-1<TAB>curTime\n"
    is returned instead.
    """
    failed = "-1" + '\t' + "-1" + '\t' + "-1" + '\t' + curTime + '\n'
    try:
        ser = home.getRuntimeMBean(sname, "ServerRuntime")
        for eqRT in ser.getExecuteQueueRuntimes():
            if eqRT.getAttribute("Name") == 'weblogic.kernel.Default':
                eqtthreads = eqRT.getAttribute("ExecuteThreadTotalCount")
                eqithreads = eqRT.getAttribute("ExecuteThreadCurrentIdleCount")
                eqloc = (eqRT.getObjectName()).getLocation()
                return str(eqtthreads) + '\t' + str(eqithreads) + '\t' + eqloc + '\t' + curTime + '\n'
    except:
        # Bare except so that Java exceptions from the MBean calls are caught too.
        dumpStack()
    # Reached when the default queue was not found or an error occurred.
    return failed
def getDBPool():
    """Return one tab-separated record per JDBC connection pool.

    Columns: location, MaxCapacity, ActiveConnectionsCurrentCount,
    ActiveConnectionsHighCount, WaitSecondsHighCount,
    WaitingForConnectionCurrentCount, State, pool name, curTime.
    Returns an empty string when there are no pools.
    """
    val = ''
    for poolRT in home.getMBeansByType('JDBCConnectionPoolRuntime'):
        pname = poolRT.getName()
        pmaxcapacity = poolRT.getAttribute("MaxCapacity")
        paccc = poolRT.getAttribute("ActiveConnectionsCurrentCount")
        pachc = poolRT.getAttribute("ActiveConnectionsHighCount")
        pwshc = poolRT.getAttribute("WaitSecondsHighCount")
        pwfccc = poolRT.getAttribute("WaitingForConnectionCurrentCount")
        pstate = poolRT.getAttribute("State")
        ploc = (poolRT.getObjectName()).getLocation()
        fields = [ploc, str(pmaxcapacity), str(paccc), str(pachc),
                  str(pwshc), str(pwfccc), pstate, pname, curTime]
        val = val + '\t'.join(fields) + '\n'
    return val
# Driver: run monall() TotalCount times, refreshing the shared timestamp
# before each pass.
# NOTE(review): the loop header was garbled in the source
# ("while num num=num+1"); reconstructed as "while num<TotalCount:".
try:
    num = 0
    while num < TotalCount:
        num = num + 1
        try:
            curTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())  # 2009-12-08 16:18:25
            monall()
        except:
            dumpStack()
        # Pause only between passes; sleeping after the last one would just
        # delay the exit of the script.
        if num < TotalCount:
            time.sleep(sleeptime)  # seconds
except:
    dumpStack()
生成py配置文件的shell
--config-81.sh--
#!/sbin/sh
# config-81.sh - generate one WLST monitoring script per server in mon81.cfg.
#
# Each non-comment line of mon81.cfg is whitespace separated:
#   1 system abbreviation  2 node name  3 server name  4 server IP
#   5 server port  6 monitoring hours  7 user  8 password  ...
# Field 7 may also hold "user/password" in a single field.
# A user or password of "0" selects the default "weblogic".
#
# configCnt81.cfg holds the first line number to process, so a re-run only
# handles lines added since the previous run; put 1 in it to regenerate all.
#
# Environment (optional):
#   WLST81_CONFIG_DIR  directory holding mon81.cfg, base81.py and the counters
#                      (default /weblogic/config/wlst81)
#   WLST81_PY_DIR      directory receiving the generated scripts
#                      (default /weblogic/properties/wlst81)
config81_main()
{
  configDir=${WLST81_CONFIG_DIR:-/weblogic/config/wlst81}
  configPY=${WLST81_PY_DIR:-/weblogic/properties/wlst81}
  configFile=$configDir/mon81.cfg
  countFile=$configDir/cnt81.cfg
  basePyFile=$configDir/base81.py
  configCnt=$configDir/configCnt81.cfg

  if [ ! -r "$configFile" ] || [ ! -r "$basePyFile" ]
  then
    echo "config-81.sh: cannot read $configFile or $basePyFile" >&2
    return 1
  fi

  # First line to process; fall back to 1 when the marker is missing or junk.
  lineNum=
  if [ -r "$configCnt" ]
  then
    lineNum=$(awk 'NR == 1 { print $1 }' "$configCnt")
  fi
  case "$lineNum" in
    ''|*[!0-9]*) lineNum=1 ;;
  esac
  count=$(awk 'END { print NR }' "$configFile")

  # Reset the monitor's counters only on a full regeneration; an incremental
  # run must keep the counters of servers that are already being monitored.
  if [ "$lineNum" -le 1 ]
  then
    : >"$countFile"
  fi

  while [ "$lineNum" -le "$count" ]
  do
    cfgLine=$(sed -n "${lineNum}p" "$configFile")
    # Advance first so that every "continue" below moves on to the next line.
    lineNum=$(expr "$lineNum" + 1)

    # Split the line into fields without glob expansion.
    set -f
    set -- $cfgLine
    set +f
    systemName=$1
    NodeName=$2
    serverName=$3
    serverIP=$4
    serverPort=$5
    userName=$7
    password=$8

    # Skip blank lines and comment lines.
    case "$systemName" in
      ''|'#'*) continue ;;
    esac

    if [ -z "$serverName" ] || [ -z "$serverIP" ] || [ -z "$serverPort" ]
    then
      echo "config-81.sh: skipping incomplete line: $cfgLine" >&2
      continue
    fi

    # Accept the documented single-field "user/password" form.
    case "$userName" in
      */*)
        password=${userName#*/}
        userName=${userName%%/*}
        ;;
    esac
    if [ -z "$userName" ] || [ "$userName" = "0" ]
    then
      userName=weblogic
    fi
    if [ -z "$password" ] || [ "$password" = "0" ]
    then
      password=weblogic
    fi

    echo "ServerName,IP,Port is $serverName $serverIP $serverPort "

    # Header of the generated script; base81.tmp stays in the current
    # directory as before.
    {
      echo "from java.util import Date"
      echo "from java.text import SimpleDateFormat"
      echo "import time"
      echo " "
      echo "username='$userName'"
      echo "password='${password}'"
      echo "urldict={\"$serverName\":\"t3://$serverIP:$serverPort\"}"
    } >base81.tmp

    target=$configPY/$systemName-$NodeName-$serverName.py
    cat base81.tmp "$basePyFile" >"$target"

    # Replace, rather than duplicate, this server's counter line.
    awk -v s="$systemName" -v n="$NodeName" -v v="$serverName" \
      '!($1 == s && $2 == n && $3 == v)' "$countFile" >"$countFile.tmp" 2>/dev/null
    echo "$systemName $NodeName $serverName 0 0 0 0 0 0 " >>"$countFile.tmp"
    mv "$countFile.tmp" "$countFile"
  done

  # Remember where the next incremental run has to start.
  expr "$count" + 1 >"$configCnt"
  return 0
}

config81_main "$@"
设置环境变量的shell
--setenv.sh--
# setenv.sh - environment needed to run "java weblogic.WLST".
# Meant to be sourced; monitor-81.sh does so when WLST_HOME is not exported.
WLST_HOME="/weblogic/weblogic81/monitor"
export WLST_HOME

# Jars required by WLST on WebLogic 8.1.
CLASSPATH=$WLST_HOME/weblogic.jar:$WLST_HOME/jython.jar:$WLST_HOME/wlst.jar
export CLASSPATH

# Java 1.4 first, then the existing PATH, then the Oracle-bundled JDK.
PATH=/opt/java1.4/bin:$PATH:/oracle/product/9.2.0/jdk/bin

# Show the resulting settings, separated by blank lines.
printf '\n'
echo $CLASSPATH
printf '\n'
echo $PATH
printf '\n'
主轮询监控shell
--monitor-81.sh--
#!/sbin/sh
# monitor-81.sh - settings shared by the functions and the polling loop below.

# Configuration, generated-script and output directories.
configDir=/weblogic/config/wlst81
configPY=/weblogic/properties/wlst81
outputDir=/weblogic/output

# Per-server state: system, node and server name followed by the counts of
# consecutive failures / busy samples.
cntFile=${configDir}/cnt81.cfg
# Main configuration file and the scratch copy used when rewriting cntFile.
configFile=${configDir}/mon81.cfg
tempFile=${configDir}/tmp81.cfg
# How many consecutive occurrences of a problem raise an alarm.
warningConfFile=${configDir}/warnCnt81.cfg
# File collecting the alarm messages.
errFile=${outputDir}/web_err.out

# Scratch files for monitoring results (threads, connection pools, memory),
# relative to the current directory.
srvTmpOutFile=srv.txt
eqTmpOutFile=eq.txt
dbTmpOutFile=db.txt
memTmpOutFile=mem.txt

# Load the WLST environment unless WLST_HOME is already exported.
runEnv=$(env | grep -c WLST_HOME=)
if [ "$runEnv" -eq 0 ]
then
  . ./setenv.sh
fi
# Execute_WLST SCRIPT RESULT_FILE
# Run "java weblogic.WLST SCRIPT" in the background to produce monitoring
# data and wait for it to finish.
#   $1  path of the generated WLST script
#   $2  result file the script writes (used to judge success)
# Prints the number of lines in RESULT_FILE on stdout, or 0 when the run
# timed out (the WLST process is then terminated) or the file is unreadable.
# WLST standard output is appended to nohup.out in the current directory.
# Optional tuning variables (defaults reproduce the original timings):
#   WLST_START_WAIT     seconds to wait before the first check (2)
#   WLST_POLL_INTERVAL  seconds between checks (1)
#   WLST_MAX_POLLS      checks allowed before the run counts as hung (35)
#   WLST_SETTLE_WAIT    seconds to wait before reading the result file (2)
Execute_WLST()
{
  nohup java weblogic.WLST "$1" >>nohup.out &
  # Track the exact child instead of searching the process table, which can
  # match (and kill) unrelated processes.
  wlst_pid=$!
  sleep "${WLST_START_WAIT:-2}"
  fail_flag=0
  loop_count=0
  while kill -0 "$wlst_pid" 2>/dev/null
  do
    loop_count=$((loop_count + 1))
    # Still running after the allowed number of checks: the service is not
    # answering, so stop the monitoring process.
    if [ "$loop_count" -gt "${WLST_MAX_POLLS:-35}" ]
    then
      fail_flag=1
      # Ask politely first, then force.
      kill "$wlst_pid" 2>/dev/null
      sleep "${WLST_POLL_INTERVAL:-1}"
      kill -9 "$wlst_pid" 2>/dev/null
      break
    fi
    sleep "${WLST_POLL_INTERVAL:-1}"
  done
  if [ "$fail_flag" -eq 0 ]
  then
    sleep "${WLST_SETTLE_WAIT:-2}"
    if [ -r "$2" ]
    then
      awk 'END { print NR }' "$2"
    else
      echo "0"
    fi
  else
    echo "0"
  fi
}
# _Print_Error_emit SYSTEM SERVER CHN_NAME NODE BANNER CONSOLE_MSG LOG_MSG LOG_SUFFIX ERR_MSG
# Shared tail of Print_Error_info: show the banner and the console message,
# then append one line to the per-server log and one to $errFile.
_Print_Error_emit()
{
  banner "$5"
  echo "$1($3)@$4-$2 $6"
  echo "$(date "+%Y-%m-%d %H:%M:%S") $1($3)@$4-$2 $7">>$outputDir/$1-$4-$2.$8.out
  echo "ERROR $4 $(date "+%Y-%m-%d %H:%M:%S") $1($3)@$4-$2 $9">>$errFile
}

# Print_Error_info SYSTEM SERVER CHN_NAME TYPE NODE
# Raise an alarm for one server.  TYPE is serverType, threadType, memoryType
# or dbpoolType; any other value only shows the "SYSTEM SERVER" banner.
# Uses the globals $outputDir and $errFile.
Print_Error_info()
{
  banner "$1 $2"
  case "$4" in
    serverType)
      _Print_Error_emit "$1" "$2" "$3" "$5" "down!!" "服务异常!! " "Fail " srv "Server IS Down!"
      ;;
    threadType)
      _Print_Error_emit "$1" "$2" "$3" "$5" "thread busy" "服务线程忙!! " "thread busy " eq "Thread IS Busy!"
      ;;
    memoryType)
      _Print_Error_emit "$1" "$2" "$3" "$5" "mem high" "java内存使用率高!! " "memory high " mem "Java MEM_ARGS Used Higher!"
      ;;
    dbpoolType)
      _Print_Error_emit "$1" "$2" "$3" "$5" "dbpool busy" "数据库连接池忙!! " "dbpool busy " db "DBConnectPool IS Busy!"
      ;;
  esac
}
# ---------------------------------------------------------------------------
# Helpers for the polling loop
# ---------------------------------------------------------------------------

# num_or VALUE DEFAULT
# Print VALUE when it is an (optionally negative) integer, otherwise DEFAULT.
num_or()
{
  case "${1#-}" in
    ''|*[!0-9]*) echo "$2" ;;
    *) echo "$1" ;;
  esac
}

# to_minutes TIME
# Convert "H" or "H:MM" into minutes since midnight.
to_minutes()
{
  tm_hour=$(num_or "${1%%:*}" 0)
  case "$1" in
    *:*) tm_min=$(num_or "${1#*:}" 0) ;;
    *) tm_min=0 ;;
  esac
  # expr treats "08" as decimal 8, unlike $(( )), which would read it as octal.
  expr "$tm_hour" \* 60 + "$tm_min"
}

# read_threshold KEY
# Print the second field of the first line of $warningConfFile matching KEY.
# NOTE(review): falls back to 3 when the entry is missing or not numeric -
# confirm that this default suits the installation.
read_threshold()
{
  num_or "$(awk -v key="$1" '$0 ~ key { print $2; exit }' "$warningConfFile" 2>/dev/null)" 3
}

# last_field FILE N
# Print field N of the last line of FILE.
last_field()
{
  awk -v n="$2" '{ v = $n } END { print v }' "$1"
}

# Extra attempts made after a failed check before the server counts as down
# (same number of attempts as the original loop).
retryLimit=3

# ---------------------------------------------------------------------------
# Polling loop: one pass over mon81.cfg per iteration, forever.
# ---------------------------------------------------------------------------
while true
do
  if [ ! -r "$configFile" ]
  then
    echo "monitor-81.sh: cannot read $configFile" >&2
    exit 1
  fi

  # Alarm thresholds: consecutive occurrences needed before an alarm is raised.
  serverWarningCnt=$(read_threshold serverWarning)
  memoryWarningCnt=$(read_threshold memoryWarning)
  dbpoolWarningCnt=$(read_threshold dbpoolWarning)
  threadWarningCnt=$(read_threshold threadWarning)

  # Between 23:00 and 06:59 three more occurrences are required.
  mon_time=$(date +%H)
  if [ "$mon_time" -gt 22 ] || [ "$mon_time" -lt 7 ]
  then
    serverWarningCnt=$(expr "$serverWarningCnt" + 3)
    memoryWarningCnt=$(expr "$memoryWarningCnt" + 3)
    dbpoolWarningCnt=$(expr "$dbpoolWarningCnt" + 3)
    threadWarningCnt=$(expr "$threadWarningCnt" + 3)
  fi

  # Number of lines in the configuration file; line 1 is the header.
  cfgCount=$(awk 'END { print NR }' "$configFile")
  lineNum=2
  checkedCnt=0

  while [ "$lineNum" -le "$cfgCount" ]
  do
    cfgLine=$(sed -n "${lineNum}p" "$configFile")
    # Advance first so that every "continue" below moves on to the next line.
    lineNum=$(expr "$lineNum" + 1)

    # Split the line into fields without glob expansion.
    set -f
    set -- $cfgLine
    set +f
    systemName=$1
    NodeName=$2
    serverName=$3
    sysmonTime=$6
    # Field 7 is either the user (password in field 8) or "user/password" in
    # one field; the remaining fields shift accordingly.
    case "$7" in
      */*)
        chnsysName=$8
        idleThread=$9
        memUsedPer=${10}
        memUsedWar=${11}
        ;;
      *)
        chnsysName=$9
        idleThread=${10}
        memUsedPer=${11}
        memUsedWar=${12}
        ;;
    esac

    # Skip blank lines and comment lines.
    case "$systemName" in
      ''|'#'*) continue ;;
    esac
    if [ -z "$serverName" ]
    then
      echo "monitor-81.sh: skipping incomplete line: $cfgLine" >&2
      continue
    fi

    # Missing thresholds: alarm on no idle thread, memory alarm disabled
    # (usage can never exceed 100%), no minimum used-memory size.
    idleThread=$(num_or "$idleThread" 0)
    memUsedPer=$(num_or "$memUsedPer" 100)
    memUsedWar=$(num_or "$memUsedWar" 0)

    # Systems that are not 24x7 ("0-0" or empty) are skipped at weekends and
    # outside their "start-end" window, given as hours or as H:MM.
    case "$sysmonTime" in
      ''|0-0) ;;
      *-*)
        dayOfWeek=$(date +%u)   # 1 = Monday ... 7 = Sunday, locale independent
        nowMin=$(expr "$(date +%H)" \* 60 + "$(date +%M)")
        startMin=$(to_minutes "${sysmonTime%%-*}")
        endMin=$(to_minutes "${sysmonTime#*-}")
        if [ "$dayOfWeek" -ge 6 ] || [ "$nowMin" -lt "$startMin" ] || [ "$nowMin" -ge "$endMin" ]
        then
          continue
        fi
        ;;
    esac

    checkedCnt=$(expr "$checkedCnt" + 1)

    # State from the previous pass.  The three name fields are compared
    # exactly so that "srv1" does not also match "srv10".
    cntLine=$(awk -v s="$systemName" -v n="$NodeName" -v v="$serverName" \
      '$1 == s && $2 == n && $3 == v { print; exit }' "$cntFile" 2>/dev/null)
    set -f
    set -- $cntLine
    set +f
    serverFailCnt=$(num_or "$4" 0)
    threadBusyCnt=$(num_or "$5" 0)
    dbPoolBusyCnt=$(num_or "$6" 0)
    memoryHighCnt=$(num_or "$7" 0)

    echo "######################### For WebLogic-81 Check #########################"
    DATE_TIME=$(date "+%Y-%m-%d %H:%M:%S")
    echo "@@@@@@@@@@@@@@@@@@@@@@@@@@ $DATE_TIME @@@@@@@@@@@@@@@@@@@@@@@@@@"

    pyFile=$configPY/$systemName-$NodeName-$serverName.py
    outBase=$outputDir/$systemName-$NodeName-$serverName

    # Sample the server, retrying while no data comes back.
    attempt=0
    count=0
    while [ "$attempt" -le "$retryLimit" ]
    do
      # Start every attempt with empty scratch files so that leftovers of a
      # failed attempt are neither re-read nor archived twice.
      : >"$eqTmpOutFile"
      : >"$dbTmpOutFile"
      : >"$memTmpOutFile"
      count=$(Execute_WLST "$pyFile" "$eqTmpOutFile")
      count=$(num_or "$count" 0)
      # Keep the raw monitoring data.
      cat "$eqTmpOutFile" >>"$outBase.eq.out"
      cat "$dbTmpOutFile" >>"$outBase.db.out"
      cat "$memTmpOutFile" >>"$outBase.mem.out"
      if [ "$count" -gt 0 ]
      then
        break
      fi
      attempt=$(expr "$attempt" + 1)
    done

    if [ "$count" -gt 0 ]
    then
      # Server answered: report the thread figures and check for busy threads.
      serverFailCnt=0
      threadCount=$(last_field "$eqTmpOutFile" 1)
      idleCount=$(num_or "$(last_field "$eqTmpOutFile" 2)" 1)
      echo "$systemName($chnsysName)@$NodeName:"
      printf '%-15s服务线程总数 : %-22s空闲线程数 : %-5s\n' "$serverName" "$threadCount" "$idleCount"
      if [ "$idleCount" -le "$idleThread" ]
      then
        threadBusyCnt=$(expr "$threadBusyCnt" + 1)
        if [ "$threadBusyCnt" -ge "$threadWarningCnt" ]
        then
          Print_Error_info "$systemName" "$serverName" "$chnsysName" threadType "$NodeName"
        fi
      else
        threadBusyCnt=0
      fi
    else
      # No data after all attempts: count a failure, alarm after several.
      serverFailCnt=$(expr "$serverFailCnt" + 1)
      threadBusyCnt=0
      echo " Fail count is $serverFailCnt "
      if [ "$serverFailCnt" -ge "$serverWarningCnt" ]
      then
        Print_Error_info "$systemName" "$serverName" "$chnsysName" serverType "$NodeName"
      fi
    fi

    # Java heap: record format is free MB, total MB, used MB, timestamp.
    # Nothing is changed when no record was produced.
    if [ -s "$memTmpOutFile" ]
    then
      freeMem=$(num_or "$(last_field "$memTmpOutFile" 1)" 0)
      totalMem=$(num_or "$(last_field "$memTmpOutFile" 2)" 0)
      usedMem=$(num_or "$(last_field "$memTmpOutFile" 3)" 0)
      if [ "$totalMem" -gt 0 ]
      then
        usedPer=$(expr "$usedMem" \* 100 / "$totalMem")
        printf '%-15sjava内存总数 : %-22s空闲内存数 : %-5s\n' "$serverName" "${totalMem}M" "${freeMem}M"
        # Alarm when both the usage percentage and the used size exceed
        # their configured limits.
        if [ "$usedPer" -gt "$memUsedPer" ] && [ "$usedMem" -gt "$memUsedWar" ]
        then
          memoryHighCnt=$(expr "$memoryHighCnt" + 1)
          if [ "$memoryHighCnt" -ge "$memoryWarningCnt" ]
          then
            Print_Error_info "$systemName" "$serverName" "$chnsysName" memoryType "$NodeName"
          fi
        else
          memoryHighCnt=0
        fi
      fi
    fi

    # JDBC pools: one record per pool (2 = max capacity, 3 = active
    # connections, 6 = waiters, 8 = pool name).  The server counts as busy
    # when any pool has waiters, so an idle pool listed after a busy one no
    # longer resets the counter.  Nothing is changed when no record exists.
    if [ -s "$dbTmpOutFile" ]
    then
      awk '{ printf "%-15s连接池配置数 : %-22s空闲连接池 : %-5s\n", $8, $2, $2 - $3 }' "$dbTmpOutFile"
      busyPools=$(awk '$6 + 0 > 0 { n++ } END { print n + 0 }' "$dbTmpOutFile")
      if [ "$busyPools" -gt 0 ]
      then
        dbPoolBusyCnt=$(expr "$dbPoolBusyCnt" + 1)
        if [ "$dbPoolBusyCnt" -ge "$dbpoolWarningCnt" ]
        then
          Print_Error_info "$systemName" "$serverName" "$chnsysName" dbpoolType "$NodeName"
        fi
      else
        dbPoolBusyCnt=0
      fi
    fi

    # Store the counters for the next pass: drop this server's old line
    # (exact match) and append the new one.
    awk -v s="$systemName" -v n="$NodeName" -v v="$serverName" \
      '!($1 == s && $2 == n && $3 == v)' "$cntFile" >"$tempFile" 2>/dev/null
    echo "$systemName $NodeName $serverName $serverFailCnt $threadBusyCnt $dbPoolBusyCnt $memoryHighCnt " >>"$tempFile"
    cp "$tempFile" "$cntFile"
  done

  : >nohup.out

  # Nothing was due for monitoring in this pass: wait instead of spinning.
  if [ "$checkedCnt" -eq 0 ]
  then
    sleep 5
  fi
done
----需要的jar包 weblogic.jar,wlst.jar,jython.jar----
----weblogic9的监控类似,weblogic9集成了WLST工具,只需安装软件,不用创建domain,稍加配置即可,需要的java版本为1.5----