#!/bin/bash
#
# diagnose_holds.sh - script to examine condor jobs that are on hold
#                     and look in the logs for the reasons why.
#
# author: richard.t.jones at uconn.edu
# version: november 16, 2024

# Directory containing this script. The job logs are looked up in the
# sibling directories matching ../log*.d, as files named stdout.<jobid>.
script_dir=$(dirname "$0")

# Take the first column of every "condor_q -hold" output line and keep only
# the entries that start with a digit; this drops header and blank lines.
while IFS= read -r jobid; do
    printf '%s: ' "$jobid"

    # Resolve the log glob once per job into an array, so that paths with
    # whitespace survive and a missing log never reaches grep/awk (which
    # would otherwise complain on stderr about a nonexistent file).
    logs=()
    for candidate in "$script_dir"/../log*.d/stdout."$jobid"; do
        if [[ -f $candidate ]]; then
            logs+=("$candidate")
        fi
    done

    if (( ${#logs[@]} == 0 )); then
        echo "job $jobid did not return its logs, ignoring..."
    elif grep -q 'Error .* cannot fetch' "${logs[@]}"; then
        # Input staging failed: report the last word of the "running on" line.
        host=$(awk '/running on /{print $NF}' "${logs[@]}")
        echo "job $jobid running on $host failed to fetch input files, ignoring..."
    elif grep 'running' "${logs[@]}"; then
        # The "running" lines were just printed by grep; indent the Error
        # lines that follow so they line up under the job id prefix.
        printf '       '
        if ! grep 'Error' "${logs[@]}"; then
            # Terminate the indented line; otherwise the report for the
            # next job would be appended to this one.
            echo "no Error lines found in its logs"
        fi
    else
        # A log exists but holds no "running" line; treated like a missing log.
        echo "job $jobid did not return its logs, ignoring..."
    fi
done < <(condor_q -hold | awk '$1 ~ /^[0-9]/ {print $1}')
