#!/bin/sh
# health check script for Torque on CMM Clusters. Tested with Torque 2.3.6.
# use in combination with:
# $node_check_interval 40,jobstart,jobend
# this will check the node's viability about once every half hour
# as well as at the beginning and end of each job

# Absolute paths of the external tools used by the checks below.
DMESG=/bin/dmesg
LSPCI=/sbin/lspci
IBVDEVINFO=/opt/ofed-1.3.1/bin/ibv_devinfo
PBSNODES=/opt/bin/pbsnodes

# Node name exactly as printed by /bin/hostname; used in log messages.
LONGHOSTNAME=$(/bin/hostname)
# Short node name: everything before the first dot. This is the name that
# is handed to pbsnodes. Parameter expansion is used instead of piping
# through cut so the result does not depend on where cut is installed.
HOSTNAME=${LONGHOSTNAME%%.*}

# Helper: take this node offline in Torque and log the event to syslog,
# unless pbsnodes already lists the node as offline.
# Globals:   PBSNODES, HOSTNAME, LONGHOSTNAME (read)
#            LOGGER (read, optional; defaults to /usr/bin/logger)
# Arguments: $@ - the words of the reason, appended to the log message
# Returns:   0 if the node was already offline, otherwise the exit
#            status of "pbsnodes -o"
take_offline () {
  # Already offline: return early so the event is not logged again.
  if "$PBSNODES" -l "$HOSTNAME" | grep -q offline; then
    return 0
  fi
  # "$*" is quoted so the reason is logged verbatim; unquoted, characters
  # such as * or ? in the reason would be expanded to file names.
  "${LOGGER:-/usr/bin/logger}" -p local0.error \
    "$LONGHOSTNAME taken offline due to $*."
  "$PBSNODES" -o "$HOSTNAME"
}

# Debugging aid: create the file /var/spool/PBS/mom_priv/test_dead to make
# this script treat the node as failed, so that the offline/report path
# can be exercised by hand.
testbad=0
if [ -f /var/spool/PBS/mom_priv/test_dead ]; then
  testbad=1
  errmsg="health check script test"
  take_offline "$errmsg"
fi

# InfiniBand check. Only runs on nodes where lspci lists an InfiniBand
# device. The node passes if the verbose ibv_devinfo output for mthca0, or
# failing that for ipath0, contains exactly two PORT_ACTIVE/LINK_UP lines;
# otherwise it is taken offline.
# NOTE(review): "exactly two" presumably means one port with both state
# lines present -- confirm how HCAs with more than one active port report.
badib=0
if $LSPCI | grep -q -i infiniband; then
  ib_up=$($IBVDEVINFO -v -d mthca0 | grep -E -c 'PORT_ACTIVE|LINK_UP')
  if [ 2 != "$ib_up" ]; then
    # mthca0 did not pass: give ipath0 a chance before failing the node.
    ib_up=$($IBVDEVINFO -v -d ipath0 | grep -E -c 'PORT_ACTIVE|LINK_UP')
  fi
  if [ 2 != "$ib_up" ]; then
    badib=1
    errmsg="error in IB status"
    take_offline "$errmsg"
  fi
fi

# Myrinet check. Any dmesg line matching
# "GM:.*LANai.*interface not responding" takes the node offline.
badmyr=0
myr_hits=$($DMESG | grep -c "GM:.*LANai.*interface not responding")
if [ 0 != "$myr_hits" ]; then
  badmyr=1
  errmsg="not responding Myrinet card"
  take_offline "$errmsg"
fi

# Hardware check. Any dmesg line containing "machine check events logged"
# (matched case-insensitively) takes the node offline.
badhw=0
mce_hits=$($DMESG | grep -i -c 'machine check events logged')
if [ 0 != "$mce_hits" ]; then
  badhw=1
  errmsg="bad hardware. check /var/log/mcelog"
  take_offline "$errmsg"
fi

# Report error for torque/maui on stdout. Results in "message" node
# property added.
# Globals: testbad, badib, badmyr, badhw (read; unset counts as 0)
#          errmsg (read; message of the last check that failed)
# Outputs: the lines "ERROR" and "errmsg=<message>" if any check failed,
#          nothing otherwise
report_health_errors () {
  # badib is part of the sum so that an InfiniBand failure is reported
  # just like the other failures, not only used to take the node offline.
  if [ 0 -lt $(( ${testbad:-0} + ${badib:-0} + ${badmyr:-0} + ${badhw:-0} )) ]; then
    echo ERROR
    echo errmsg="${errmsg}"
  fi
}
report_health_errors
