#!/bin/bash

# Description:
#   This script aids management of condor compute nodes.  

#   Does a peaceful shutdown of condor, reboots the node, and then
#   restarts condor on node.

#Usage:
# condor_reboot {node}

# At least one argument (the node name) is required; with none, print the
# usage text and exit with status 1.
if (( $# == 0 )); then
  cat <<'EOF'

Usage:
condor_reboot {node}
  where node is the name in condor (public name)
EOF
  exit 1
fi

# Unless the first argument is --child, relaunch this script in the
# background with --child prepended to the original arguments and return
# immediately.  The child's stdout and stderr are appended to
# /tmp/condor_waiter.<pid of this parent>.out.
# To run in the foreground (for debugging for instance) give the --child
# option yourself, as the first argument.
if [ "x$1" != "x--child" ]; then
  # "$0" and "$@" are quoted so that a script path or argument containing
  # whitespace or glob characters reaches the child unchanged.
  "$0" --child "$@" >> "/tmp/condor_waiter.$$.out" 2>&1 &
  exit 0
fi

# shift off the --child option
shift

# Node name as given on the command line (first remaining argument).
INPUT_NODE_NAME=$1

if [ -z "$INPUT_NODE_NAME" ]; then
  logger -s -- "condor_reboot $$: no node name given"
  exit 1
fi

# Helper script that is queried for the node's NAME_PRIVATE / NAME_PUBLIC fields.
nodeinfo_get=/home/install/extras/nodeinfo/nodeinfo-get.py

# get private nodename (used below as the ssh target on the node itself)
PRIVATE_NODE_NAME=$(python "$nodeinfo_get" "$INPUT_NODE_NAME" NAME_PRIVATE)

# get public nodename (the name passed to condor_off / condor_on)
PUBLIC_NODE_NAME=$(python "$nodeinfo_get" "$INPUT_NODE_NAME" NAME_PUBLIC)

# Stop here if either lookup produced nothing: an empty name would make the
# later ssh commands treat their command string as the host name.
if [ -z "$PRIVATE_NODE_NAME" ] || [ -z "$PUBLIC_NODE_NAME" ]; then
  logger -s -- "condor_reboot $$: could not resolve private/public name for node $INPUT_NODE_NAME"
  exit 1
fi

#echo PRIVATE $PRIVATE_NODE_NAME
#echo PUBLIC $PUBLIC_NODE_NAME

# send condor_off -peaceful for the node, run on msu-osg over ssh.
# A failure is logged but does not stop the script: the wait/reboot steps
# that follow still run exactly as before.
if ! ssh msu-osg "condor_off -peaceful $PUBLIC_NODE_NAME"; then
  logger -s -- "condor_reboot $$: warning: condor_off -peaceful $PUBLIC_NODE_NAME via msu-osg failed; continuing"
fi

# this could be reworked using skill and condor_startd process name...
# could also check for Claimed slots with:
#   condor_status c-113-4 | grep Claimed
#startd_pid=`ssh $PRIVATE_NODE_NAME "ps h -o pid -C condor_startd"`
#echo $startd_pid

#timestart=$(date -u +%s)

#if [ "x$startd_pid" != "x" ]; then
#    logger -s -- condor_reboot $$: condor_startd PID $startd_pid is running waiting for it to end before rebooting node $PRIVATE_NODE_NAME

    # wait for condor_master to go away
    # kill -0 is just a way to check if the process is alive
    # RESULT will be 0 if process exists
#    RESULT=0
#    while [ "$RESULT" -eq "0" ]; do
#      echo "sleeping 120"
#      sleep 120
      # note double single-quotes.  want single quotes sent to ssh so that $? is interpreted on target
#      RESULT=`ssh $PRIVATE_NODE_NAME ''kill -0 $startd_pid 2>/dev/null; echo $?''`
#    done
#else
#    logger -s -- condor_reboot: condor_startd was not running on $PRIVATE_NODE_NAME at my startup
#fi

# Wait until no condor_startd process is left on the node, then reboot it.
#
# Reads:   PRIVATE_NODE_NAME - host that is polled over ssh and rebooted.
# Polling: the first probe happens immediately, later ones every 120 seconds.
# Exits 1 without rebooting if the node cannot be probed several times in a row.

: "${PRIVATE_NODE_NAME:?condor_reboot: PRIVATE_NODE_NAME is not set}"

# Consecutive failed probes tolerated before giving up.
max_probe_failures=5
probe_failures=0

# Print the number of condor_startd processes running on the node.
# pgrep is used rather than `skill -v`, because skill sends a signal
# (TERM by default) to every matching process instead of only listing it.
# Returns non-zero when ssh fails or the output is not a number, so that an
# unreachable node is never mistaken for "startd has exited".
count_startd() {
  local n
  n=$(ssh "$PRIVATE_NODE_NAME" 'pgrep -x condor_startd | wc -l') || return 1
  [[ "$n" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

timestart=$(date -u +%s)

label="BEFORE RESULT"
while :; do
  if RESULT=$(count_startd); then
    probe_failures=0
    echo "$label is $RESULT"
    if [ "$RESULT" -eq 0 ]; then
      break
    fi
  else
    probe_failures=$((probe_failures + 1))
    logger -s -- "condor_reboot $$: could not count condor_startd on $PRIVATE_NODE_NAME (failure $probe_failures of $max_probe_failures)"
    if [ "$probe_failures" -ge "$max_probe_failures" ]; then
      logger -s -- "condor_reboot $$: giving up on $PRIVATE_NODE_NAME, node will NOT be rebooted"
      exit 1
    fi
  fi
  label="RESULT"
  sleep 120
done

timeend=$(date -u +%s)
logger -s -- "condor_reboot $$: condor_startd has gone away after $((timeend - timestart)) secs will now reboot node $PRIVATE_NODE_NAME"
ssh "$PRIVATE_NODE_NAME" "shutdown -r now"

# Give the node time to come back up before turning condor back on
# (a clean reboot takes a bit less than 2 minutes).
sleep 300

logger -s -- "condor_reboot $$: sending condor_on to msu-osg for $PUBLIC_NODE_NAME"

# Run condor_on for the node on msu-osg.  If it fails, log that and exit
# with the same status ssh returned, so the failure is visible in syslog.
condor_on_status=0
ssh msu-osg "condor_on $PUBLIC_NODE_NAME" || condor_on_status=$?
if [ "$condor_on_status" -ne 0 ]; then
  logger -s -- "condor_reboot $$: condor_on $PUBLIC_NODE_NAME via msu-osg failed with status $condor_on_status"
  exit "$condor_on_status"
fi

# need some logic here to verify we are done...