#!/bin/bash
#
# diagnose_holds.sh - script to examine condor jobs that are on hold
#                     and look in the logs for the reasons why.
#
# author: richard.t.jones at uconn.edu
# version: november 16, 2024

# Directory that contains the log*.d directories holding the per-job
# stdout.<jobid> files; located one level above this script's directory.
log_parent="$(dirname -- "$0")/.."

# diagnose_hold LINE
#
# Examine one line of "condor_q -hold" output and print a diagnosis for the
# held job on stdout. Lines whose first word is not a job id of the form
# <cluster>.<proc> (headers, blank lines, totals) are silently skipped.
#
# If the line carries a hold reason after its second colon (the first colon
# is the one inside the HH:MM hold time), the eviction is reported directly.
# Otherwise the job's stdout log(s) under log_parent/log*.d are searched for
# the cause.
#
# Globals:   log_parent (read)
# Arguments: $1 - a single line of condor_q -hold output
# Outputs:   diagnosis text on stdout
# Returns:   0 always
diagnose_hold() {
    local line=$1
    local jobid date_evicted time_evicted worker rest reason host
    local jobid_re='^[0-9]+\.[0-9]+$'
    local -a logs

    # Columns are: jobid owner date time reason...
    read -r jobid _ date_evicted time_evicted _ <<<"$line"

    # Proc ids start at 0, so any digits are allowed on both sides of the
    # dot; the pattern is anchored so only a complete job id is accepted.
    if [[ ! $jobid =~ $jobid_re ]]; then
        return 0
    fi

    # The reason is everything after the second colon, kept whole even if it
    # contains further colons, with surrounding whitespace trimmed.
    rest=${line#*:}
    reason=""
    if [[ $rest == *:* ]]; then
        reason=${rest#*:}
        reason=${reason#"${reason%%[![:space:]]*}"}
        reason=${reason%"${reason##*[![:space:]]}"}
    fi

    if [[ -n $reason ]]; then
        # The worker host sits between the '@' of the slot name and the
        # colon that introduces the reason.
        worker=$(awk -F'[@:]' '{print $3}' <<<"$line")
        printf 'job %s evicted on %s at %s from %s because %s\n' \
            "$jobid" "$date_evicted" "$time_evicted" "$worker" "$reason"
        return 0
    fi

    # No reason on the condor_q line: look in the job's stdout log(s).
    # If nothing matches, the glob stays literal and the greps below fail
    # quietly (stderr is discarded on purpose), leading to the final branch.
    logs=( "$log_parent"/log*.d/stdout."$jobid" )

    printf '%s: ' "$jobid"
    if grep -q -- 'Error .* cannot fetch' "${logs[@]}" 2>/dev/null; then
        host=$(awk '/running on /{print $NF}' "${logs[@]}")
        echo "job $jobid running on $host failed to fetch input files, ignoring..."
    elif grep -- 'running' "${logs[@]}" 2>/dev/null; then
        # The "running" line(s) were just printed; indent what follows.
        printf '       '
        if ! grep -- 'Error' "${logs[@]}" 2>/dev/null; then
            # Always end the indented line so the next job starts on a new one.
            echo "no Error message found in stdout log"
        fi
    else
        echo "job $jobid did not return its logs, ignoring..."
    fi
    return 0
}

condor_q -hold | while read -r line; do
    diagnose_hold "$line"
done

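# Lines printed by condor_q -hold look like the two examples below. The first
# carries "<slot>@<worker>: <reason>"; the second has no reason after a second
# colon, so the job's stdout log is searched for the cause instead.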
# 10649.11976 jonesrt         2/15 02:04 Error from slot1_5@UChicago-Pile-Backfill.c007.pile.uchicago.edu: disk usage exceeded request_disk
# 10649.13155 jonesrt         2/15 02:27 The job attribute OnExitHold expression '(ExitBySignal == false) && (ExitCode != 0)' evaluated to TRUE
