#!/bin/csh -f
#
#       %W% %E%
#

# current_nodes  lists machines
# queued_jobs    lists jobs to be done
# running_jobs   lists jobs presently running, with status: running or suspended
# finished_jobs
# find idle machine

# Work out the per-host administration directory below GIG_D_ROOT and the
# names of the files this script reads and rewrites in it.
set host = `uname -n`
set curf = curf_$host
set rendir = $GIG_D_ROOT/projects/renmanadm/$curf
set rendirjobsf = $rendir/running_jobs
set rendirjobsf_tmp = ${rendirjobsf}_tmp
set logfile = $rendir/gigrs_logfile

# First check, for every entry of running_jobs, whether its process still
# exists on the server concerned.
# (original note: was commented out)
#
# Each line of running_jobs is read as a word list: word 2 is used as the
# node name and word 3 as the process id on that node.
#   - gigrs_rshell returns non-zero : node is treated as not reachable, the
#     entry is passed to gigrs_updjobstatus_p with a leading lost word and
#     the node line is removed from current_nodes.
#   - otherwise the answer is read from the file <rendir>/<node><pid>:
#       first word not 0 : must equal the pid, then gigrs_checkwhours runs
#       first word 0     : process is gone, entry is reported as unsu and
#                          requeued as lost if its node line is still there
# l_frams is set to 1 as soon as one entry was marked lost.
#
# Every update of current_nodes is bracketed by touching nodeflag_2, writing
# 1 to nodeturn and spinning while nodeflag_1 exists and nodeturn reads 1.
# NOTE(review): this looks like a two-party turn-taking lock with whoever
# owns nodeflag_1 - confirm against the other gigrs scripts.

set l_frams = 0
set n_entrs = `cat $rendirjobsf | awk 'BEGIN {} {s+=1;} END {print s}' s=0 -`
# echo number of current jobs = $n_entrs
set i = 0

while ( $i < $n_entrs )
# fetch line number i of running_jobs, counting from 0
  set get_ent = `cat $rendirjobsf | awk 'BEGIN{}{if(s==t){print $0;exit};s+=1;}END{}' s=0 t=$i -`

# a blank or truncated line has no node and pid words - indexing it below
# would abort the whole script, so log it and go on with the next entry
  if ( $#get_ent < 3 ) then
    echo $0 skipping malformed running_jobs entry $i >> ${logfile}
    @ i += 1
    continue
  endif

# echo gigrs_rshell $get_ent[2] gigrs_stillalive $host $get_ent[3] >> ${logfile}
  gigrs_rshell $get_ent[2] "gigrs_stillalive $host $get_ent[3]"

  if ( $status ) then
# node is not reachable
# prepare requeuing of the frame
    set l_frams = 1
    set new_ent = ("lost "$get_ent)
#     echo gigrs_updjobstatus_p $new_ent
    gigrs_updjobstatus_p $new_ent
# delete only this entry from current_nodes
    touch ${rendir}/nodeflag_2
    echo 1 > ${rendir}/nodeturn
    set turn = `cat ${rendir}/nodeturn`
    while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
      set turn = `cat ${rendir}/nodeturn`
    end

    fgrep -v "$get_ent[2] $get_ent[3]" $rendir"/current_nodes" >! $rendir"/current_nodes_tmp"
    mv $rendir"/current_nodes_tmp" $rendir"/current_nodes"

    /bin/rm -f ${rendir}/nodeflag_2

    echo $0 $get_ent[2] not reachable >> ${logfile}

  else
# echo sleeping
# echo waiting for ${rendir}/${get_ent[2]}${get_ent[3]} >> ${logfile}

# wait for the answer file - at most 200 rounds of a 200 step counting loop
# NOTE(review): the delay is a pure counting loop, so its length depends on
# machine speed - confirm it is still long enough for the remote answer
    set nom = 0
    while ( !(-e ${rendir}/${get_ent[2]}${get_ent[3]}) && ( $nom < 200 ) )
      set cnt = 0
      while ( $cnt < 200 )
        @ cnt += 1
      end
      @ nom += 1
    end
# one more short delay before the file is read
    set cnt = 0
    while ( $cnt < 200 )
      @ cnt += 1
    end
# echo waiting done >> ${logfile}
#    gigrs_sleep 10 # synchronization

# read the answer only if it really arrived - an empty word list must not
# be indexed, that would abort the script with a subscript error
    if ( -e ${rendir}/${get_ent[2]}${get_ent[3]} ) then
      set gig_pid = `cat $rendir/${get_ent[2]}${get_ent[3]}`
      /bin/rm ${rendir}/${get_ent[2]}${get_ent[3]}
    else
      set gig_pid = ()
    endif

    if ( $#gig_pid == 0 ) then
# no answer or an empty answer within the waiting time
      echo $0 an error occurred >> ${logfile}
      echo $0 gigdeljob $get_ent this SHOULD NOT happen >> ${logfile}
    else
      if ( $gig_pid[1] != 0 ) then
        if ( ( $gig_pid[1] != $get_ent[3] ) || ( $nom == 200 ) ) then
          echo $0 an error occurred >> ${logfile}
          echo $0 gigdeljob $get_ent this SHOULD NOT happen >> ${logfile}
        else
          gigrs_checkwhours ${get_ent[2]} ${get_ent[3]}
        endif

      else
# process is gone
# test whether it is finished
# if finished update
# else requeue and finishframe
        set new_ent = ("unsu "$get_ent)
        gigrs_updjobstatus_p $new_ent
# now if the frame is ready or if an error occurred the entry in current_nodes has been removed
# if the entry is still there the frame is not finished and must be requeued

        touch ${rendir}/nodeflag_2
        echo 1 > ${rendir}/nodeturn
        set turn = `cat ${rendir}/nodeturn`
        while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
          set turn = `cat ${rendir}/nodeturn`
        end
        set tmp_ent = `fgrep "$get_ent[2] $get_ent[3]" $rendir"/current_nodes" | awk '{print $3}'`

        /bin/rm -f ${rendir}/nodeflag_2
# quoted so that the test is well formed for no match and for several matches
        if ( "$tmp_ent" == "$get_ent[3]" ) then
# prepare requeuing of the frame
          set l_frams = 1
          set new_ent = ("lost "$get_ent)
#        echo gigrs_updjobstatus_p $new_ent
          gigrs_updjobstatus_p $new_ent
# delete only this entry from current_nodes
          touch ${rendir}/nodeflag_2
          echo 1 > ${rendir}/nodeturn
          set turn = `cat ${rendir}/nodeturn`
          while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
            set turn = `cat ${rendir}/nodeturn`
          end
          fgrep -v "$get_ent[2] $get_ent[3]" $rendir"/current_nodes" >! $rendir"/current_nodes_tmp"
          mv $rendir"/current_nodes_tmp" $rendir"/current_nodes"
          /bin/rm -f ${rendir}/nodeflag_2
        endif
      endif
    endif

  endif
#   echo get_ent $get_ent
  @ i += 1
end

# Drop every running_jobs line that contains the word done.
# Skipped when running_jobs had no entries at the start of this pass.
if ( $n_entrs > 0 ) then
  egrep -v 'done' $rendirjobsf >! $rendirjobsf_tmp
  /bin/mv $rendirjobsf_tmp $rendirjobsf
endif

# Requeue lost jobs: put them in front of queued_jobs and take them out of
# running_jobs.  The first script argument narrows the match.
# Both steps use fgrep with the same fixed string, so exactly the lines that
# were copied to the queue are the lines removed from running_jobs.
if ( $l_frams == 1 ) then
  echo $0 requeuing .... >> ${logfile}
  fgrep "lost $1" $rendirjobsf >! $rendir"/queued_jobs_tmp"
  cat $rendir"/queued_jobs" >> $rendir"/queued_jobs_tmp"
  /bin/mv $rendir"/queued_jobs_tmp" $rendir"/queued_jobs"

  fgrep -v "lost $1" $rendir"/running_jobs" >! $rendir"/running_jobs_tmp"
  mv $rendir"/running_jobs_tmp" $rendir"/running_jobs"

endif

# find idle machine
#
# Dispatch loop: as long as current_nodes holds a line containing idle and
# queued_jobs holds a job, try to start the first queued job on the first
# idle machine.
#
# Words of a current_nodes line as used here:
#   1 state, set to runs once a job was started
#   2 machine name
#   3 set to the pid of the started job
#   4 path handed to gigrs_startnode
#   5 6 optional HHMM limits compared with the current time on weekdays
#       presumably begin and end of the render hours - confirm
# First queued_jobs line: the last word names the pid file, word 1 is passed
# on as stt_str, words 2 and 3 are dropped, the rest is the command.
#
# Idle lines whose hours do not match now are moved to current_nodes_pst and
# re_enter is set, so that they can be put back after the loop.
# knt only makes the name of the reachability marker file unique per round.
#
# NOTE(review): the waits for the marker file and for a non-empty pid file
# have no upper bound - confirm that this is acceptable.

set idle_ent = `fgrep idle $rendir"/current_nodes" | awk '{print NF; exit}'`

set re_enter = 0

set knt = 1

# quoted so that the test is well formed when no idle line was found
while ( "$idle_ent" != '' )

  set mac_ent = `fgrep 'idle' $rendir"/current_nodes" | awk '{print $0; exit}'`

  set idle_mach = $mac_ent[2]

# find work to be done just read in first entry of queued_jobs

  set entry_a = ''
  set com_str = ''
  set entry_a = `cat $rendir"/queued_jobs" | awk '{print NF; exit}'`
  set com_str = `cat $rendir"/queued_jobs" | awk '{{for(i=1;i<NF;i++) print $i}; exit}'`
  set pid_fil = `cat $rendir"/queued_jobs" | awk '{print $NF; exit}'`

  if ( "$entry_a" != '' ) then

    set stt_str = $com_str[1]
    shift com_str
    shift com_str
    shift com_str

# both limits, words 5 and 6, are read below, so at least 6 words are needed
    if ( $#mac_ent > 5 ) then
      set get_date = `date`

      if ( $get_date[1] != Sat && $get_date[1] != Sun ) then
        set get_time = $get_date[4]
# current time as the plain decimal number HH*100+MM - no leading zero,
# which some csh versions would parse as octal and reject for 08xx and 09xx
# NOTE(review): words 5 and 6 are compared as stored - confirm that they
# are written without leading zeros
        set get_digi = `echo $get_time | awk -F: '{print $1 * 100 + $2}'`
        if ( $mac_ent[5] > $mac_ent[6] ) then
# window wraps around midnight
          if ( $get_digi >= $mac_ent[5] || $get_digi < $mac_ent[6] ) then
            set startjob = 1
          else
            set startjob = 0
          endif
        else
          if ( $get_digi >= $mac_ent[5] && $get_digi < $mac_ent[6] ) then
            set startjob = 1
          else
            set startjob = 0
          endif
        endif
      else
        set startjob = 1
      endif

    else
      set startjob = 1
    endif

    set sa_path = $mac_ent[4]

    if ( $startjob > 0 ) then

# echo gigrs_rshell $idle_mach "echo $idle_mach > ${rendir}/${idle_mach}_running_${knt}" >> ${logfile}

# reachability test: let the node write a marker file and wait for it
      gigrs_rshell $idle_mach "echo $idle_mach > ${rendir}/${idle_mach}_running_${knt}"
      if ( $status ) then
#idle_mach is not reachable
        gigrs_delnodes_p $idle_mach
        echo $0 $idle_mach not reachable >> ${logfile}
# leave the loop, not the script, so that nodes already parked in
# current_nodes_pst are still put back by the step after this loop
        break
      endif
      while ( !(-e ${rendir}/${idle_mach}_running_${knt}) )
        sleep 1
      end
      /bin/rm ${rendir}/${idle_mach}_running_${knt}

#start up gigrs_startnode com_str on idle_mach
      if (-e ${rendir}/${idle_mach}${pid_fil} ) then 
        echo /bin/rm ${rendir}/${idle_mach}${pid_fil} >> ${logfile}
        /bin/rm ${rendir}/${idle_mach}${pid_fil}
      endif

      set command = ("gigrs_startnode " $sa_path $host $stt_str $com_str)
# echo gigrs_rshell $idle_mach $command >> ${logfile}
      gigrs_rshell $idle_mach "$command"

#check whether idle_mach has indeed started rendering
# up to 20 polls, 2 seconds apart, for the pid file of the new job
      set anothertry = 20

      while ( $anothertry )
        sleep 2
        @ anothertry -= 1
        if (-e ${rendir}"/"${idle_mach}${pid_fil} ) then 
#update queue entry
# wait until the pid file has content
          set anothertr2 = 1
          while ( $anothertr2 )
            sleep 1
            set pid_num = `cat ${rendir}"/"${idle_mach}${pid_fil} | awk '{print $1; exit}'`
            if ( "$pid_num" != '' ) then
              set anothertr2 = 0
            endif
          end
# echo started on $idle_mach ':' $pid_num $com_str $pid_fil >> ${logfile}
          echo runs $idle_mach $pid_num $com_str $pid_fil >> $rendir"/running_jobs"
# the pid file name is data, so it is matched as a fixed string
          fgrep -v "$pid_fil" $rendir"/queued_jobs" >! $rendir"/queued_jobs_tmp"
          /bin/mv $rendir"/queued_jobs_tmp" $rendir"/queued_jobs"
#update host entry 1 - that is one
#count idle $idle_mach 
          touch ${rendir}/nodeflag_2
          echo 1 > ${rendir}/nodeturn
          set turn = `cat ${rendir}/nodeturn`
          while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
            set turn = `cat ${rendir}/nodeturn`
          end

# the node line is data, so it is matched as a fixed string as well
# all but one of the identical idle lines are kept, one becomes a runs line
          set no_entries = `fgrep "$mac_ent" $rendir"/current_nodes" | awk 'BEGIN {} {s+=1;} END {print s}' s=0 -`
          @ no_entries -= 1
          fgrep -v "$mac_ent" $rendir"/current_nodes" >! $rendir"/current_nodes_tmp"
          while ( $no_entries > 0 )
            echo "$mac_ent" >> $rendir"/current_nodes_tmp"
            @ no_entries -= 1
          end
          set mac_ent[1] = runs
          set mac_ent[3] = $pid_num
          echo "$mac_ent" >> $rendir"/current_nodes_tmp"
          mv $rendir"/current_nodes_tmp" $rendir"/current_nodes"

          /bin/rm -f ${rendir}/nodeflag_2

#remove $rendir"/"$pid_fil
#echo ${idle_mach}${pid_fil} `cat $rendir"/"${idle_mach}${pid_fil}` >> ${logfile}
          /bin/rm -f $rendir"/"${idle_mach}${pid_fil}
          set anothertry = 0
          set starterror = 0

        else if ( $anothertry == 0 ) then
          set starterror = 1
        endif
      end 

      if ( $starterror ) then
#idle_mach is reachable, but an error occurred during startup gig_sa
        echo $0 an error occured on $idle_mach during startup gig_sa >> ${logfile}
        echo $0 deleting $idle_mach from list of current servers >> ${logfile}
        gigrs_delnodes_p $idle_mach
# leave the loop, not the script, so that nodes already parked in
# current_nodes_pst are still put back by the step after this loop
        break

      endif

    else

# outside the hours of this node: park all its identical idle lines in
# current_nodes_pst so that the next round sees the next idle machine
      touch ${rendir}/nodeflag_2
      echo 1 > ${rendir}/nodeturn
      set turn = `cat ${rendir}/nodeturn`
      while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
        set turn = `cat ${rendir}/nodeturn`
      end

      set no_entries = `fgrep "$mac_ent" $rendir"/current_nodes" | awk 'BEGIN {} {s+=1;} END {print s}' s=0 -`
      fgrep -v "$mac_ent" $rendir"/current_nodes" >! $rendir"/current_nodes_tmp"
      /bin/mv $rendir"/current_nodes_tmp" $rendir"/current_nodes"

      /bin/rm -f ${rendir}/nodeflag_2

      if ( !(-e $rendir"/current_nodes_pst" ) ) then
        touch $rendir"/current_nodes_pst"
      endif
      while ( $no_entries > 0 )
        echo "$mac_ent" >> $rendir"/current_nodes_pst"
        @ no_entries -= 1
      end
      set re_enter = 1

    endif

    set idle_ent = `fgrep idle $rendir"/current_nodes" | awk '{print NF; exit}'`

  else

# nothing queued: end the loop
    set idle_ent = ''

  endif

@ knt += 1
end

# Re-enter into current_nodes all idle entries that were set aside in
# current_nodes_pst because the current time is outside their render hours.
# The parked entries go in front of the remaining ones.
#
# The merged list is built in current_nodes_tmp and then moved into place,
# as everywhere else in this script, so current_nodes exists at every moment
# and no stale current_nodes_tmp is left behind.

if ( $re_enter > 0 ) then

# same nodeflag_2 and nodeturn handshake as for every other update of
# current_nodes in this script
  touch ${rendir}/nodeflag_2
  echo 1 > ${rendir}/nodeturn
  set turn = `cat ${rendir}/nodeturn`
  while ( ( -e ${rendir}/nodeflag_1 ) && ( $turn == '1' ) )
    set turn = `cat ${rendir}/nodeturn`
  end

  cat ${rendir}/current_nodes_pst ${rendir}/current_nodes >! ${rendir}/current_nodes_tmp
  /bin/mv ${rendir}/current_nodes_tmp ${rendir}/current_nodes

  /bin/rm -f ${rendir}/nodeflag_2
  /bin/rm -f ${rendir}/current_nodes_pst

endif
