#!/bin/bash
#
# recycle_holds.sh - script to examine condor jobs that are on hold
#                    and see if they ever completed, either with error
#                    or success, and if not send them back to idle.
#
# author: richard.t.jones at uconn.edu
# version: september 26, 2024

# Directory containing this script; the per-job stdout logs are looked up
# in sibling directories matching ../log*.d relative to it.
script_dir=$(dirname "$0")

# is_job_id ID
#   Succeed only if ID has the form <cluster>.<proc>. This keeps header and
#   summary lines of the condor_q listing (including a summary line that
#   starts with a job count) from being mistaken for a job to release.
is_job_id() {
    [[ $1 =~ ^[0-9]+\.[0-9]+$ ]]
}

# recycle_hold JOBID [LOGFILE...]
#   Report on one held job and release it when it should be retried.
#   LOGFILE... are the existing stdout logs found for the job (possibly none).
#     - log shows a failed input fetch      -> report and condor_release
#     - log shows the job started running   -> print the matching 'running'
#                                              lines and any 'Error' lines,
#                                              and leave the job on hold
#     - no log, or no 'running' line in it  -> report and condor_release
recycle_hold() {
    local jobid=$1
    shift
    local host

    printf '%s: ' "$jobid"
    if (( $# == 0 )); then
        # No log file at all: never hand grep an empty file list, it would
        # read stdin instead.
        echo "job $jobid did not return its logs, recycle"
        condor_release "$jobid"
    elif grep -q 'Error .* cannot fetch' "$@" 2>/dev/null; then
        host=$(awk '/running on /{print $NF}' "$@")
        echo "job $jobid running on $host unable to fetch inputs, recycle"
        condor_release "$jobid"
    elif grep 'running' "$@"; then
        # grep printed the 'running' line(s); indent the error lines so they
        # line up under the job id.
        printf '       '
        if ! grep 'Error' "$@"; then
            # No error lines were printed: terminate the indented line so the
            # next job's report starts on a fresh line.
            echo
        fi
    else
        echo "job $jobid did not return its logs, recycle"
        condor_release "$jobid"
    fi
}

# Walk the first column of the hold listing. Word splitting of the command
# substitution is intentional: awk's $1 never contains whitespace.
for jobid in $(condor_q -hold | awk '{print $1}'); do
    is_job_id "$jobid" || continue

    # Resolve the log glob explicitly so that an unmatched pattern yields an
    # empty list instead of a literal 'log*.d' path.
    logs=()
    for logfile in "$script_dir"/../log*.d/stdout."$jobid"; do
        if [[ -e $logfile ]]; then
            logs+=("$logfile")
        fi
    done

    recycle_hold "$jobid" "${logs[@]}"
done
