#!/bin/bash
#
# Description:
#   This script aids management of condor compute nodes.
#   Does a peaceful shutdown of condor, reboots the node, and then
#   restarts condor on the node.
#
# Usage:
#   condor_reboot {node}
#   condor_reboot --child {node}   # stay in the foreground (debugging)
#
#   {node} is the name of the node in condor (public name).
#
# Without --child the script relaunches itself in the background and returns
# immediately; the background run appends its output to
# /tmp/condor_waiter.<pid>.out, where <pid> is the PID of the launching shell.
#
# Note: "set -e" is deliberately not used. Several ssh calls are expected to
# return non-zero (e.g. ps finding no condor_startd) and the script must keep
# going so that condor_on is still sent at the end.

# Helper script that maps a node name to its private/public names.
readonly NODEINFO_GET=/home/install/extras/nodeinfo/nodeinfo-get.py
# Host on which condor_off / condor_on are issued.
readonly CONDOR_HEAD=msu-osg

# Print the usage text and exit with status 1.
usage() {
  echo
  echo Usage:
  echo condor_reboot '{node}'
  echo ' where node is the name in condor (public name)'
  exit 1
}

if [ "$#" -eq 0 ]; then
  usage
fi

# If not launched with the --child option, relaunch in the background.
# To run in the foreground (for debugging for instance) give the --child
# option (first).
if [ "x$1" != "x--child" ]; then
  "$0" --child "$@" >> "/tmp/condor_waiter.$$.out" 2>&1 &
  exit 0
fi

# Shift off the --child option.
shift

# "condor_reboot --child" with no node gets past the first argument check,
# so check again now that --child has been removed.
if [ "$#" -eq 0 ] || [ -z "$1" ]; then
  usage
fi

INPUT_NODE_NAME=$1

# Look up the private and public names of the node.
PRIVATE_NODE_NAME=$(python "$NODEINFO_GET" "$INPUT_NODE_NAME" NAME_PRIVATE)
PUBLIC_NODE_NAME=$(python "$NODEINFO_GET" "$INPUT_NODE_NAME" NAME_PUBLIC)

#echo PRIVATE $PRIVATE_NODE_NAME
#echo PUBLIC $PUBLIC_NODE_NAME

# Refuse to continue with an empty name: every command below would otherwise
# run without a target node (condor_off/condor_on would be sent to
# $CONDOR_HEAD with no node argument, and ssh would take the remote command
# as the host name).
if [ -z "$PRIVATE_NODE_NAME" ] || [ -z "$PUBLIC_NODE_NAME" ]; then
  logger -s -- "condor_reboot $$: could not resolve private/public name for node '$INPUT_NODE_NAME', giving up"
  exit 1
fi

# Send condor_off.
ssh "$CONDOR_HEAD" "condor_off -peaceful $PUBLIC_NODE_NAME"

# This could be reworked using skill and the condor_startd process name...
# Could also check for Claimed slots with:
#   condor_status c-113-4 | grep Claimed
startd_pid=$(ssh "$PRIVATE_NODE_NAME" "ps h -o pid -C condor_startd")

# ps pads the PID column and prints one PID per line. Word splitting is used
# on purpose to collapse that into a single space-separated list, so it can
# be embedded safely in the remote kill command and in log messages.
# shellcheck disable=SC2086
startd_pid=$(echo $startd_pid)
echo "$startd_pid"

timestart=$(date -u +%s)

if [ -n "$startd_pid" ]; then
  logger -s -- "condor_reboot $$: condor_startd PID $startd_pid is running waiting for it to end before rebooting node $PRIVATE_NODE_NAME"

  # Wait for condor_startd to go away.
  # kill -0 sends no signal; it is just a way to check if the process is
  # alive. ssh exits with the status of the remote command, so RESULT is 0
  # while the process exists. Any other status ends the wait, including 255
  # (ssh itself could not reach the node).
  RESULT=0
  while [ "$RESULT" -eq 0 ]; do
    echo "sleeping 120"
    sleep 120
    ssh "$PRIVATE_NODE_NAME" "kill -0 $startd_pid" 2> /dev/null
    RESULT=$?
  done
else
  logger -s -- "condor_reboot: condor_startd was not running on $PRIVATE_NODE_NAME at my startup"
fi

timeend=$(date -u +%s)

logger -s -- "condor_reboot $$: condor_startd $startd_pid has gone away after $((timeend - timestart)) secs will now reboot node $PRIVATE_NODE_NAME"

ssh "$PRIVATE_NODE_NAME" "shutdown -r now"

# A clean reboot takes a bit less than 2 minutes; wait 5 to be safe.
sleep 300

logger -s -- "condor_reboot $$: sending condor_on to $CONDOR_HEAD for $PUBLIC_NODE_NAME"

ssh "$CONDOR_HEAD" "condor_on $PUBLIC_NODE_NAME"

# TODO: need some logic here to verify we are done...